diff --git a/.bumpversion.cfg b/.bumpversion.cfg index e2769671..34815c52 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -4,4 +4,6 @@ commit = True tag = True tag_name = {new_version} +[bumpversion:file:docs/conf.py] + [bumpversion:file:scrapy_zyte_api/__version__.py] diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 66643b86..ee74037b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -62,7 +62,7 @@ jobs: fail-fast: false matrix: python-version: ["3.11"] - tox-job: ["mypy", "linters", "twine-check"] + tox-job: ["mypy", "linters", "twine-check", "docs"] steps: - uses: actions/checkout@v3 diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 00000000..1519565e --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,12 @@ +version: 2 +formats: all +sphinx: + configuration: docs/conf.py +build: + os: ubuntu-22.04 + tools: + python: "3.11" # Keep in sync with .github/workflows/test.yml +python: + install: + - requirements: docs/requirements.txt + - path: . diff --git a/CHANGES.rst b/CHANGES.rst index eb7a655e..b8b19745 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -167,7 +167,7 @@ TBR cookiejar of the request. * A new boolean setting, ``ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED``, can be - set to ``True`` to enable automated mapping of cookies from a request + set to ``True`` to enable automatic mapping of cookies from a request cookiejar into the ``experimental.requestCookies`` Zyte API parameter. * ``ZyteAPITextResponse`` is now a subclass of ``HtmlResponse``, so that the @@ -239,10 +239,10 @@ When upgrading, you should set the following in your Scrapy settings: be set to ``True`` to make all requests use Zyte API by default, with request parameters being automatically mapped to Zyte API parameters. 
* Add a Request meta key, ``zyte_api_automap``, that can be used to enable - automated request parameter mapping for specific requests, or to modify the - outcome of automated request parameter mapping for specific requests. + automatic request parameter mapping for specific requests, or to modify the + outcome of automatic request parameter mapping for specific requests. * Add a ``ZYTE_API_AUTOMAP_PARAMS`` setting, which is a counterpart for - ``ZYTE_API_DEFAULT_PARAMS`` that applies to requests where automated request + ``ZYTE_API_DEFAULT_PARAMS`` that applies to requests where automatic request parameter mapping is enabled. * Add the ``ZYTE_API_SKIP_HEADERS`` and ``ZYTE_API_BROWSER_HEADERS`` settings to control the automatic mapping of request headers. diff --git a/README.rst b/README.rst index 5bf84d52..ba8a178b 100644 --- a/README.rst +++ b/README.rst @@ -18,978 +18,13 @@ scrapy-zyte-api :target: https://codecov.io/gh/scrapy-plugins/scrapy-zyte-api :alt: Coverage report +.. description starts -Scrapy plugin for `Zyte API`_. +Scrapy plugin for seamless `Zyte API`_ integration. .. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html +.. description ends -Requirements -============ - -* Python 3.7+ -* Scrapy 2.0.1+ - -scrapy-poet integration requires more recent software: - -* Python 3.8+ -* Scrapy 2.6+ - -Installation -============ - -.. code-block:: - - pip install scrapy-zyte-api - - -Quick start -=========== - -Get a `Zyte API`_ key, and add it to your project settings.py: - -.. code-block:: python - - ZYTE_API_KEY = "YOUR_API_KEY" - -Instead of adding API key to setting.py you can also set -``ZYTE_API_KEY`` environment variable. - -Then, set up the scrapy-zyte-api integration: - -.. 
code-block:: python - - DOWNLOAD_HANDLERS = { - "http": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler", - "https": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler", - } - DOWNLOADER_MIDDLEWARES = { - "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 1000, - } - REQUEST_FINGERPRINTER_CLASS = "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter" - SPIDER_MIDDLEWARES = { - "scrapy_zyte_api.ScrapyZyteAPISpiderMiddleware": 100, - } - TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" - -By default, scrapy-zyte-api doesn't change the spider behavior. -To switch your spider to use Zyte API for all requests, -set the following option: - -.. code-block:: python - - ZYTE_API_TRANSPARENT_MODE = True - -Configuration -============= - -To enable this plugin: - -- Set the ``http`` and ``https`` keys in the `DOWNLOAD_HANDLERS - `_ - Scrapy setting to ``"scrapy_zyte_api.ScrapyZyteAPIDownloadHandler"``. - -- Add ``"scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware"`` to the - `DOWNLOADER_MIDDLEWARES - `_ - Scrapy setting with any value, e.g. ``1000``. - -- Add ``"scrapy_zyte_api.ScrapyZyteAPISpiderMiddleware"`` to the - `SPIDER_MIDDLEWARES - `_ - Scrapy setting with any value, e.g. ``100``. - -- Set the `REQUEST_FINGERPRINTER_CLASS - `_ - Scrapy setting to ``"scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter"``. - -- Set the `TWISTED_REACTOR - `_ - Scrapy setting to - ``"twisted.internet.asyncioreactor.AsyncioSelectorReactor"``. - - .. note:: On existing projects that were not using the asyncio Twisted - reactor, your existing code may need changes, such as: - - - `Handling a pre-installed Twisted reactor - `_. - - Some Twisted imports install the default, non-asyncio Twisted - reactor as a side effect. Once a reactor is installed, it cannot be - changed for the whole run time. - - - `Converting Twisted Deferreds into asyncio Futures - `_. - - Note that you might be using Deferreds without realizing it through - some Scrapy functions and methods. 
For example, when you yield the - return value of ``self.crawler.engine.download()`` from a spider - callback, you are yielding a Deferred. - -- Set `your Zyte API key - `_ as - either the ``ZYTE_API_KEY`` Scrapy setting or as an environment variable of - the same name. - -The ``ZYTE_API_ENABLED`` setting, which is ``True`` by default, can be set to -``False`` to disable this plugin. - -If you want to use scrapy-poet integration, add a provider to -``SCRAPY_POET_PROVIDERS`` (see `scrapy-poet integration`_): - -.. code-block:: python - - SCRAPY_POET_PROVIDERS = { - "scrapy_zyte_api.providers.ZyteApiProvider": 1100, - } - -Usage -===== - -You can send requests through Zyte API in one of the following ways: - -- Send all request through Zyte API by default, letting Zyte API parameters - be chosen automatically based on your Scrapy request parameters. See - `Using transparent mode`_. - -- Send specific requests through Zyte API, setting all Zyte API parameters - manually, keeping full control of what is sent to Zyte API. - See `Sending requests with manually-defined parameters`_. - -- Send specific requests through Zyte API, letting Zyte API parameters be - chosen automatically based on your Scrapy request parameters. - See `Sending requests with automatically-mapped parameters`_. - -Zyte API response parameters are mapped into Scrapy response parameters where -possible. See `Response mapping`_ for details. - - -Using transparent mode ----------------------- - -Set the ``ZYTE_API_TRANSPARENT_MODE`` `Scrapy setting`_ to ``True`` to handle -Scrapy requests as follows: - -.. _Scrapy setting: https://docs.scrapy.org/en/latest/topics/settings.html - -- By default, requests are sent through Zyte API with automatically-mapped - parameters. See `Sending requests with automatically-mapped parameters`_ - for details about automatic request parameter mapping. 
- - You do not need to set the ``zyte_api_automap`` request meta key to - ``True``, but you can set it to a dictionary to extend your Zyte API - request parameters. - -- Requests with the ``zyte_api`` request meta key set to a ``dict`` are sent - through Zyte API with manually-defined parameters. - See `Sending requests with manually-defined parameters`_. - -- Requests with the ``zyte_api_automap`` request meta key set to ``False`` - are *not* sent through Zyte API. - -For example: - -.. code-block:: python - - import scrapy - - - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" - start_urls = ["https://quotes.toscrape.com/"] - - custom_settings = { - "ZYTE_API_TRANSPARENT_MODE": True, - } - - def parse(self, response): - print(response.text) - # "…" - - -Sending requests with manually-defined parameters -------------------------------------------------- - -To send a Scrapy request through Zyte API with manually-defined parameters, -define your Zyte API parameters in the ``zyte_api`` key in -`Request.meta `_ -as a ``dict``. - -The only exception is the ``url`` parameter, which should not be defined as a -Zyte API parameter. The value from ``Request.url`` is used automatically. - -For example: - -.. code-block:: python - - import scrapy - - - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" - - def start_requests(self): - yield scrapy.Request( - url="https://quotes.toscrape.com/", - meta={ - "zyte_api": { - "browserHtml": True, - } - }, - ) - - def parse(self, response): - print(response.text) - # "…" - -Note that response headers are necessary for raw response decoding. When -defining parameters manually and requesting ``httpResponseBody`` extraction, -remember to also request ``httpResponseHeaders`` extraction: - -.. 
code-block:: python - - import scrapy - - - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" - - def start_requests(self): - yield scrapy.Request( - url="https://quotes.toscrape.com/", - meta={ - "zyte_api": { - "httpResponseBody": True, - "httpResponseHeaders": True, - } - }, - ) - - def parse(self, response): - print(response.text) - # "…" - -To learn more about Zyte API parameters, see the `data extraction usage`_ and -`API reference`_ pages of the `Zyte API documentation`_. - -.. _API reference: https://docs.zyte.com/zyte-api/openapi.html -.. _data extraction usage: https://docs.zyte.com/zyte-api/usage/extract.html -.. _Zyte API documentation: https://docs.zyte.com/zyte-api/get-started.html - - -Sending requests with automatically-mapped parameters ------------------------------------------------------ - -To send a Scrapy request through Zyte API letting Zyte API parameters be -automatically chosen based on the parameters of that Scrapy request, set the -``zyte_api_automap`` key in -`Request.meta `_ -to ``True``. - -For example: - -.. code-block:: python - - import scrapy - - - class SampleQuotesSpider(scrapy.Spider): - name = "sample_quotes" - - def start_requests(self): - yield scrapy.Request( - url="https://quotes.toscrape.com/", - meta={ - "zyte_api_automap": True, - }, - ) - - def parse(self, response): - print(response.text) - # "…" - -See also `Using transparent mode`_ and `Automated request parameter mapping`_. - - -Response mapping ----------------- - -Zyte API responses are mapped with one of the following classes: - -- ``scrapy_zyte_api.responses.ZyteAPITextResponse``, a subclass of - ``scrapy.http.TextResponse``, is used to map text responses, i.e. responses - with ``browserHtml`` or responses with both ``httpResponseBody`` and - ``httpResponseHeaders`` with a text body (e.g. plain text, HTML, JSON). - -- ``scrapy_zyte_api.responses.ZyteAPIResponse``, a subclass of - ``scrapy.http.Response``, is used to map any other response. 
- -Zyte API response parameters are mapped into response class attributes where -possible: - -- ``url`` becomes ``response.url``. - -- ``statusCode`` becomes ``response.status``. - -- ``httpResponseHeaders`` and ``experimental.responseCookies`` become - ``response.headers``. - -- ``experimental.responseCookies`` is also mapped into the request cookiejar. - -- ``browserHtml`` and ``httpResponseBody`` are mapped into both - ``response.text`` (``str``) and ``response.body`` (``bytes``). - - If none of these parameters were present, e.g. if the only requested output - was ``screenshot``, ``response.text`` and ``response.body`` would be empty. - - If a future version of Zyte API supported requesting both outputs on the - same request, and both parameters were present, ``browserHtml`` would be - the one mapped into ``response.text`` and ``response.body``. - -Both response classes have a ``raw_api_response`` attribute that contains a -``dict`` with the complete, raw response from Zyte API, where you can find all -Zyte API response parameters, including those that are not mapped into other -response class atttributes. - -For example, for a request for ``httpResponseBody`` and -``httpResponseHeaders``, you would get: - -.. code-block:: python - - def parse(self, response): - print(response.url) - # "https://quotes.toscrape.com/" - print(response.status) - # 200 - print(response.headers) - # {b"Content-Type": [b"text/html"], …} - print(response.text) - # "…" - print(response.body) - # b"…" - print(response.raw_api_response) - # { - # "url": "https://quotes.toscrape.com/", - # "statusCode": 200, - # "httpResponseBody": "PGh0bWw+4oCmPC9odG1sPg==", - # "httpResponseHeaders": […], - # } - -For a request for ``screenshot``, on the other hand, the response would look -as follows: - -.. 
code-block:: python - - def parse(self, response): - print(response.url) - # "https://quotes.toscrape.com/" - print(response.status) - # 200 - print(response.headers) - # {} - print(response.text) - # "" - print(response.body) - # b"" - print(response.raw_api_response) - # { - # "url": "https://quotes.toscrape.com/", - # "statusCode": 200, - # "screenshot": "iVBORw0KGgoAAAANSUh…", - # } - from base64 import b64decode - print(b64decode(response.raw_api_response["screenshot"])) - # b'\x89PNG\r\n\x1a\n\x00\x00\x00\r…' - - -Automated request parameter mapping ------------------------------------ - -When you enable automated request parameter mapping, be it through transparent -mode (see `Using transparent mode`_) or for a specific request (see -`Sending requests with automatically-mapped parameters`_), Zyte API -parameters are chosen as follows by default: - -- ``Request.url`` becomes ``url``, same as in requests with manually-defined - parameters. - -- If ``Request.method`` is something other than ``"GET"``, it becomes - ``httpRequestMethod``. - -- ``Request.headers`` become ``customHttpRequestHeaders``. - -- ``Request.body`` becomes ``httpRequestBody``. - -- If the ``ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED`` Scrapy setting is - ``True``, the COOKIES_ENABLED_ Scrapy setting is ``True`` (default), and - provided request metadata does not set dont_merge_cookies_ to ``True``: - - .. _COOKIES_ENABLED: https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#std-setting-COOKIES_ENABLED - .. _dont_merge_cookies: https://docs.scrapy.org/en/latest/topics/request-response.html#std-reqmeta-dont_merge_cookies - - - ``experimental.responseCookies`` is set to ``True``. - - - Cookies from the request `cookie jar`_ become - ``experimental.requestCookies``. - - .. _cookie jar: https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#std-reqmeta-cookiejar - - All cookies from the cookie jar are set, regardless of their cookie - domain. 
This is because Zyte API requests may involve requests to - different domains (e.g. when following cross-domain redirects, or - during browser rendering). - - If the cookies to be set exceed the limit defined in the - ``ZYTE_API_MAX_COOKIES`` setting (100 by default), a warning is logged, - and only as many cookies as the limit allows are set for the target - request. To silence this warning, set ``experimental.requestCookies`` - manually, e.g. to an empty dict. Alternatively, if Zyte API starts - supporting more than 100 request cookies, update the - ``ZYTE_API_MAX_COOKIES`` setting accordingly. - - If you are using a custom downloader middleware to handle request - cookiejars, you can point the ``ZYTE_API_COOKIE_MIDDLEWARE`` setting to - its import path to make scrapy-zyte-api work with it. The downloader - middleware is expected to have a ``jars`` property with the same - signature as in the built-in Scrapy downloader middleware for cookie - handling. - -- ``httpResponseBody`` and ``httpResponseHeaders`` are set to ``True``. - - This is subject to change without prior notice in future versions of - scrapy-zyte-api, so please account for the following: - - - If you are requesting a binary resource, such as a PDF file or an - image file, set ``httpResponseBody`` to ``True`` explicitly in your - requests: - - .. code-block:: python - - Request( - url="https://toscrape.com/img/zyte.png", - meta={ - "zyte_api_automap": {"httpResponseBody": True}, - }, - ) - - In the future, we may stop setting ``httpResponseBody`` to ``True`` by - default, and instead use a different, new Zyte API parameter that only - works for non-binary responses (e.g. HMTL, JSON, plain text). - - - If you need to access response headers, be it through - ``response.headers`` or through - ``response.raw_api_response["httpResponseHeaders"]``, set - ``httpResponseHeaders`` to ``True`` explicitly in your requests: - - .. 
code-block:: python - - Request( - url="https://toscrape.com/", - meta={ - "zyte_api_automap": {"httpResponseHeaders": True}, - }, - ) - - At the moment we request response headers because some response headers - are necessary to properly decode the response body as text. In the - future, Zyte API may be able to handle this decoding automatically, so - we would stop setting ``httpResponseHeaders`` to ``True`` by default. - -For example, the following Scrapy request: - -.. code-block:: python - - Request( - method="POST" - url="https://httpbin.org/anything", - headers={"Content-Type": "application/json"}, - body=b'{"foo": "bar"}', - cookies={"a": "b"}, - ) - -Results in a request to the Zyte API data extraction endpoint with the -following parameters: - -.. code-block:: javascript - - { - "customHttpRequestHeaders": [ - { - "name": "Content-Type", - "value": "application/json" - } - ], - "experimental": { - "requestCookies": [ - { - "name": "a", - "value": "b", - "domain": "" - } - ], - "responseCookies": true - }, - "httpResponseBody": true, - "httpResponseHeaders": true, - "httpRequestBody": "eyJmb28iOiAiYmFyIn0=", - "httpRequestMethod": "POST", - "url": "https://httpbin.org/anything" - } - -You may set the ``zyte_api_automap`` key in -`Request.meta `_ -to a ``dict`` of Zyte API parameters to extend or override choices made by -automated request parameter mapping. - -Enabling ``browserHtml``, ``screenshot``, or an automatic extraction property, -unsets ``httpResponseBody`` and ``httpResponseHeaders``, and makes -``Request.headers`` become ``requestHeaders`` instead of -``customHttpRequestHeaders``. For example, the following Scrapy request: - -.. code-block:: python - - Request( - url="https://quotes.toscrape.com", - headers={"Referer": "https://example.com/"}, - meta={"zyte_api_automap": {"browserHtml": True}}, - ) - -Results in a request to the Zyte API data extraction endpoint with the -following parameters: - -.. 
code-block:: javascript - - { - "browserHtml": true, - "experimental": { - "responseCookies": true - }, - "requestHeaders": {"referer": "https://example.com/"}, - "url": "https://quotes.toscrape.com" - } - -When mapping headers, headers not supported by Zyte API are excluded from the -mapping by default. Use the following `Scrapy settings`_ to change which -headers are included or excluded from header mapping: - -.. _Scrapy settings: https://docs.scrapy.org/en/latest/topics/settings.html - -- ``ZYTE_API_SKIP_HEADERS`` determines headers that must *not* be mapped as - ``customHttpRequestHeaders``, and its default value is: - - .. code-block:: python - - ["User-Agent"] - -- ``ZYTE_API_BROWSER_HEADERS`` determines headers that *can* be mapped as - ``requestHeaders``. It is a ``dict``, where keys are header names and - values are the key that represents them in ``requestHeaders``. Its default - value is: - - .. code-block:: python - - {"Referer": "referer"} - -To maximize support for potential future changes in Zyte API, automated -request parameter mapping allows some parameter values and parameter -combinations that Zyte API does not currently support, and may never support: - -- ``Request.method`` becomes ``httpRequestMethod`` even for unsupported_ - ``httpRequestMethod`` values, and even if ``httpResponseBody`` is unset. - - .. _unsupported: https://docs.zyte.com/zyte-api/usage/extract.html#zyte-api-set-method - -- You can set ``customHttpRequestHeaders`` or ``requestHeaders`` to ``True`` - to force their mapping from ``Request.headers`` in scenarios where they - would not be mapped otherwise. - - Conversely, you can set ``customHttpRequestHeaders`` or ``requestHeaders`` - to ``False`` to prevent their mapping from ``Request.headers``. - -- ``Request.body`` becomes ``httpRequestBody`` even if ``httpResponseBody`` - is unset. - -- You can set ``httpResponseBody`` to ``False`` (which unsets the parameter), - and not set ``browserHtml`` or ``screenshot`` to ``True``. 
In this case, - ``Request.headers`` is mapped as ``requestHeaders``. - -- You can set ``httpResponseBody`` to ``True`` and also set ``browserHtml`` - or ``screenshot`` to ``True``. In this case, ``Request.headers`` is mapped - both as ``customHttpRequestHeaders`` and as ``requestHeaders``, and - ``browserHtml`` is used as the Scrapy response body. - - -Setting default parameters -========================== - -Often the same configuration needs to be used for all Zyte API requests. For -example, all requests may need to set the same geolocation, or the spider only -uses ``browserHtml`` requests. - -The following settings allow you to define Zyte API parameters to be included -in all requests: - -- ``ZYTE_API_DEFAULT_PARAMS`` is a ``dict`` of parameters to be combined with - manually-defined parameters. See `Sending requests with manually-defined parameters`_. - - You may set the ``zyte_api`` request meta key to an empty ``dict`` to only - use default parameters for that request. - -- ``ZYTE_API_AUTOMAP_PARAMS`` is a ``dict`` of parameters to be combined with - automatically-mapped parameters. - See `Sending requests with automatically-mapped parameters`_. - -For example, if you set ``ZYTE_API_DEFAULT_PARAMS`` to -``{"geolocation": "US"}`` and ``zyte_api`` to ``{"browserHtml": True}``, -``{"url: "…", "geolocation": "US", "browserHtml": True}`` is sent to Zyte API. - -Parameters in these settings are merged with request-specific parameters, with -request-specific parameters taking precedence. - -``ZYTE_API_DEFAULT_PARAMS`` has no effect on requests that use automated -request parameter mapping, and ``ZYTE_API_AUTOMAP_PARAMS`` has no effect on -requests that use manually-defined parameters. - -When using transparent mode (see `Using transparent mode`_), be careful -of which parameters you define through ``ZYTE_API_AUTOMAP_PARAMS``. 
In -transparent mode, all Scrapy requests go through Zyte API, even requests that -Scrapy sends automatically, such as those for ``robots.txt`` files when -ROBOTSTXT_OBEY_ is ``True``, or those for sitemaps when using a `sitemap -spider`_. Certain parameters, like ``browserHtml`` or ``screenshot``, are not -meant to be used for every single request. - -If the ``zyte_api_default_params`` request meta key is set to ``False``, the -value of the ``ZYTE_API_DEFAULT_PARAMS`` setting for this request is ignored. - -.. _ROBOTSTXT_OBEY: https://docs.scrapy.org/en/latest/topics/settings.html#robotstxt-obey -.. _sitemap spider: https://docs.scrapy.org/en/latest/topics/spiders.html#sitemapspider - - -Customizing the retry policy -============================ - -API requests are retried automatically using the default retry policy of -`python-zyte-api`_. - -API requests that exceed retries are dropped. You cannot manage API request -retries through Scrapy downloader middlewares. - -Use the ``ZYTE_API_RETRY_POLICY`` setting or the ``zyte_api_retry_policy`` -request meta key to override the default `python-zyte-api`_ retry policy with a -custom retry policy. - -A custom retry policy must be an instance of `tenacity.AsyncRetrying`_. - -Scrapy settings must be picklable, which `retry policies are not -`_, so you cannot assign retry -policy objects directly to the ``ZYTE_API_RETRY_POLICY`` setting, and must use -their import path string instead. - -When setting a retry policy through request meta, you can assign the -``zyte_api_retry_policy`` request meta key either the retry policy object -itself or its import path string. If you need your requests to be serializable, -however, you may also need to use the import path string. - -For example, to increase the maximum number of retries to 10 before dropping -the API request, you can subclass RetryFactory_ as follows: - -.. 
code-block:: python - - # project/retry_policies.py - from tenacity import stop_after_attempt - from zyte_api.aio.retry import RetryFactory - - class CustomRetryFactory(RetryFactory): - temporary_download_error_stop = stop_after_attempt(10) - - CUSTOM_RETRY_POLICY = CustomRetryFactory().build() - - # project/settings.py - ZYTE_API_RETRY_POLICY = "project.retry_policies.CUSTOM_RETRY_POLICY" - - -To extend this retry policy, so it will also retry HTTP 521 errors, the same -as HTTP 520 errors, you can implement: - -.. code-block:: python - - # project/retry_policies.py - from tenacity import retry_if_exception, RetryCallState, stop_after_attempt - from zyte_api.aio.errors import RequestError - from zyte_api.aio.retry import RetryFactory - - def is_http_521(exc: BaseException) -> bool: - return isinstance(exc, RequestError) and exc.status == 521 - - class CustomRetryFactory(RetryFactory): - - retry_condition = ( - RetryFactory.retry_condition - | retry_if_exception(is_http_521) - ) - temporary_download_error_stop = stop_after_attempt(10) - - def wait(self, retry_state: RetryCallState) -> float: - if is_http_521(retry_state.outcome.exception()): - return self.temporary_download_error_wait(retry_state=retry_state) - return super().wait(retry_state) - - def stop(self, retry_state: RetryCallState) -> bool: - if is_http_521(retry_state.outcome.exception()): - return self.temporary_download_error_stop(retry_state) - return super().stop(retry_state) - - CUSTOM_RETRY_POLICY = CustomRetryFactory().build() - - # project/settings.py - ZYTE_API_RETRY_POLICY = "project.retry_policies.CUSTOM_RETRY_POLICY" - -.. _python-zyte-api: https://github.com/zytedata/python-zyte-api -.. _RetryFactory: https://github.com/zytedata/python-zyte-api/blob/main/zyte_api/aio/retry.py -.. 
_tenacity.AsyncRetrying: https://tenacity.readthedocs.io/en/latest/api.html#tenacity.AsyncRetrying - - -Misc settings -============= - -- ``ZYTE_API_MAX_REQUESTS`` - - Default: ``None`` - - When set to an integer value > 0, the spider will close when the number of - Zyte API requests reaches it. - - Note that requests with error responses that cannot be retried or exceed - their retry limit also count here. - - -Stats -===== - -Stats from python-zyte-api_ are exposed as Scrapy stats with the -``scrapy-zyte-api`` prefix. - -For example, ``scrapy-zyte-api/status_codes/`` stats indicate the -status code of Zyte API responses (e.g. ``429`` for `rate limiting -`_ or -``520`` for `temporary download errors -`_). - -.. note:: The actual status code that is received from the target website, i.e. - the `statusCode - `_ - response field of a `Zyte API successful response - `_, - is accounted for in the ``downloader/response_status_count/`` - stat, as with any other Scrapy response. - - -Request fingerprinting -====================== - -The request fingerprinter class of this plugin ensures that Scrapy 2.7 and -later generate unique `request fingerprints -`_ -for Zyte API requests based on some of their parameters. - -For example, a request for ``browserHtml`` and a request for ``screenshot`` -with the same target URL are considered different requests. Similarly, requests -with the same target URL but different ``actions`` are also considered -different requests. - -Zyte API parameters that affect request fingerprinting ------------------------------------------------------- - -The request fingerprinter class of this plugin generates request fingerprints -for Zyte API requests based on the following Zyte API parameters: - -- ``url`` (`canonicalized `_) - - For URLs that include a URL fragment, like ``https://example.com#foo``, URL - canonicalization keeps the URL fragment if ``browserHtml`` or - ``screenshot`` are enabled. 
- -- Request attribute parameters (``httpRequestBody``, - ``httpRequestMethod``) - -- Output parameters (``browserHtml``, ``httpResponseBody``, - ``httpResponseHeaders``, ``screenshot``) - -- Rendering option parameters (``actions``, ``javascript``, - ``screenshotOptions``) - -- ``geolocation`` - -The following Zyte API parameters are *not* taken into account for request -fingerprinting: - -- Request header parameters (``customHttpRequestHeaders``, - ``requestHeaders``) - -- Metadata parameters (``echoData``, ``jobId``) - -- Experimental parameters (``experimental``) - - -Changing the fingerprinting of non-Zyte-API requests ----------------------------------------------------- - -You can assign a request fingerprinter class to the -``ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS`` Scrapy setting to configure -a custom request fingerprinter class to use for requests that do not go through -Zyte API: - -.. code-block:: python - - ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS = "custom.RequestFingerprinter" - -By default, requests that do not go through Zyte API use the default request -fingerprinter class of the installed Scrapy version. - - -Request fingerprinting before Scrapy 2.7 ----------------------------------------- - -If you have a Scrapy version older than Scrapy 2.7, Zyte API parameters are not -taken into account for request fingerprinting. This can cause some Scrapy -components, like the filter of duplicate requests or the HTTP cache extension, -to interpret 2 different requests as being the same. - -To avoid most issues, use automated request parameter mapping, either through -transparent mode or setting ``zyte_api_automap`` to ``True`` in -``Request.meta``, and then use ``Request`` attributes instead of -``Request.meta`` as much as possible. Unlike ``Request.meta``, ``Request`` -attributes do affect request fingerprints in Scrapy versions older than Scrapy -2.7. 
- -For requests that must have the same ``Request`` attributes but should still -be considered different, such as browser-based requests with different URL -fragments, you can set ``dont_filter`` to ``True`` on ``Request.meta`` to -prevent the duplicate filter of Scrapy to filter any of them out. For example: - -.. code-block:: python - - yield Request( - "https://toscrape.com#1", - meta={"zyte_api_automap": {"browserHtml": True}}, - dont_filter=True, - ) - yield Request( - "https://toscrape.com#2", - meta={"zyte_api_automap": {"browserHtml": True}}, - dont_filter=True, - ) - -Note, however, that for other Scrapy components, like the HTTP cache -extensions, these 2 requests would still be considered identical. - - -Logging request parameters -========================== - -Set the ``ZYTE_API_LOG_REQUESTS`` setting to ``True`` and the ``LOG_LEVEL`` -setting to ``"DEBUG"`` to enable the logging of debug messages that indicate -the JSON object sent on every extract request to Zyte API. - -For example:: - - Sending Zyte API extract request: {"url": "https://example.com", "httpResponseBody": true} - -The ``ZYTE_API_LOG_REQUESTS_TRUNCATE``, 64 by default, determines the maximum -length of any string value in the logged JSON object, excluding object keys. To -disable truncation, set it to 0. - -scrapy-poet integration -======================= - -``scrapy-zyte-api`` includes a `scrapy-poet provider`_ that you can use to get -data from Zyte API in page objects. It requires additional dependencies which -you can get by installing the optional ``provider`` feature: -``pip install scrapy-zyte-api[provider]``. Enable the provider in the Scrapy -settings:: - - SCRAPY_POET_PROVIDERS = { - "scrapy_zyte_api.providers.ZyteApiProvider": 1100, - } - -Request some supported dependencies in the page object:: - - @attrs.define - class ProductPage(BasePage): - response: BrowserResponse - product: Product - - - class ZyteApiSpider(scrapy.Spider): - ... 
- - def parse_page(self, response: DummyResponse, page: ProductPage): - ... - -Or request them directly in the callback:: - - class ZyteApiSpider(scrapy.Spider): - ... - - def parse_page(self, - response: DummyResponse, - browser_response: BrowserResponse, - product: Product, - ): - ... - -The currently supported dependencies are: - -* ``web_poet.BrowserHtml`` -* ``web_poet.BrowserResponse`` -* ``zyte_common_items.Product`` -* ``zyte_common_items.ProductList`` -* ``zyte_common_items.ProductNavigation`` -* ``zyte_common_items.Article`` -* ``zyte_common_items.ArticleList`` -* ``zyte_common_items.ArticleNavigation`` - -The provider will make a request to Zyte API using the ``ZYTE_API_KEY`` and -``ZYTE_API_URL`` settings. - -The provider will ignore the transparent mode and parameter mapping settings. -To add extra parameters to all Zyte API requests sent by the provider, set them -as a dictionary through the ``ZYTE_API_PROVIDER_PARAMS`` setting, for example -in ``settings.py``:: - - ZYTE_API_PROVIDER_PARAMS = {"geolocation": "IE"} - -When the ``ZYTE_API_PROVIDER_PARAMS`` setting includes one of the Zyte API -extraction options (e.g. ``productOptions`` for ``product``), but the -final Zyte API request doesn't include the corresponding data type, the -unused options are automatically removed. So, it's safe to use -``ZYTE_API_PROVIDER_PARAMS`` to set the default options for various extraction -types, e.g.:: - - ZYTE_API_PROVIDER_PARAMS = { - "productOptions": {"extractFrom": "httpResponseBody"}, - "productNavigationOptions": {"extractFrom": "httpResponseBody"}, - } - -Note that the built-in ``scrapy_poet.page_input_providers.ItemProvider`` has a -priority of 2000, so when you have page objects producing -``zyte_common_items.Product`` items you should use higher values for -``ZyteApiProvider`` if you want these items to come from these page objects, -and lower values if you want them to come from Zyte API. 
- -Currently, when ``ItemProvider`` is used together with ``ZyteApiProvider``, -it may make more requests than is optimal: the normal Scrapy response will be -always requested even when using a ``DummyResponse`` annotation, and in some -dependency combinations two Zyte API requests will be made for the same page. -We are planning to solve these problems in the future releases of -``scrapy-poet`` and ``scrapy-zyte-api``. - -.. _scrapy-poet provider: https://scrapy-poet.readthedocs.io/en/stable/providers.html - - -Running behind a proxy -====================== - -If you require a proxy to access Zyte API (e.g. a corporate proxy), configure -the ``HTTP_PROXY`` and ``HTTPS_PROXY`` environment variables accordingly, and -set the ``ZYTE_API_USE_ENV_PROXY`` setting to ``True``. +* Documentation: https://scrapy-zyte-api.readthedocs.io/en/latest/ +* License: BSD 3-clause diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..d4bb2cbb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_ext/__init__.py b/docs/_ext/__init__.py new file mode 100644 index 00000000..41814277 --- /dev/null +++ b/docs/_ext/__init__.py @@ -0,0 +1,66 @@ +import re + +from docutils import nodes +from docutils.parsers.rst.roles import set_classes + + +def http_api_reference_role( + name, rawtext, text, lineno, inliner, options={}, content=[] +): + match = re.search( + r"(?s)^(.+?)\s*<\s*((?:request|response):[a-zA-Z.]+)\s*>\s*$", text + ) + if match: + display_text = match[1] + reference = match[2] + else: + display_text = None + reference = text + if reference.startswith("request:"): + request_or_response = "request" + elif reference.startswith("response:"): + request_or_response = "response/200" + else: + raise ValueError( + f":http: directive reference must start with request: or " + f"response:, got {reference} from {text!r}." + ) + + field = reference.split(":", maxsplit=1)[1] + if not display_text: + display_text = field + refuri = ( + f"https://docs.zyte.com/zyte-api/usage/reference.html" + f"#operation/extract/{request_or_response}/{field}" + ) + set_classes(options) + node = nodes.reference(rawtext, display_text, refuri=refuri, **options) + return [node], [] + + +def setup(app): + app.add_role("http", http_api_reference_role) + # https://stackoverflow.com/a/13663325 + # + # Scrapy’s + # https://github.com/scrapy/scrapy/blob/dba37674e6eaa6c2030c8eb35ebf8127cd488062/docs/_ext/scrapydocs.py#L90C16-L110C6 + app.add_crossref_type( + directivename="setting", + rolename="setting", + indextemplate="pair: %s; setting", + ) + app.add_crossref_type( + directivename="signal", + rolename="signal", + indextemplate="pair: %s; signal", + ) + app.add_crossref_type( + directivename="command", + rolename="command", + indextemplate="pair: %s; command", + ) + app.add_crossref_type( + directivename="reqmeta", + rolename="reqmeta", + indextemplate="pair: %s; reqmeta", + ) diff --git 
a/docs/changes.rst b/docs/changes.rst new file mode 100644 index 00000000..d9e113ec --- /dev/null +++ b/docs/changes.rst @@ -0,0 +1 @@ +.. include:: ../CHANGES.rst diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..c91fd871 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,58 @@ +import sys +from pathlib import Path + +project = "scrapy-zyte-api" +copyright = "2023, Zyte Group Ltd" +author = "Zyte Group Ltd" +release = "0.12.2" + +sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext +extensions = [ + "_ext", + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.viewcode", +] + +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +html_theme = "sphinx_rtd_theme" + +intersphinx_mapping = { + "python": ( + "https://docs.python.org/3", + None, + ), + "python-zyte-api": ( + "https://python-zyte-api.readthedocs.io/en/stable", + None, + ), + "scrapy": ( + "https://docs.scrapy.org/en/latest", + None, + ), + "scrapy-poet": ( + "https://scrapy-poet.readthedocs.io/en/stable", + None, + ), + "tenacity": ( + "https://tenacity.readthedocs.io/en/latest", + None, + ), + "w3lib": ( + "https://w3lib.readthedocs.io/en/latest", + None, + ), + "web-poet": ( + "https://web-poet.readthedocs.io/en/stable", + None, + ), + "zyte": ( + "https://docs.zyte.com", + None, + ), + "zyte-common-items": ( + "https://zyte-common-items.readthedocs.io/en/latest", + None, + ), +} diff --git a/docs/first-steps/scrapy-poet.rst b/docs/first-steps/scrapy-poet.rst new file mode 100644 index 00000000..2b4c6db7 --- /dev/null +++ b/docs/first-steps/scrapy-poet.rst @@ -0,0 +1,25 @@ +.. _scrapy-poet-setup: + +================= +scrapy-poet setup +================= + +For :ref:`scrapy-poet integration `: + +- Install or reinstall ``scrapy-zyte-api`` with the ``provider`` extra to + install additional required dependencies: + + .. 
code-block:: shell + + pip install scrapy-zyte-api[provider] + +- Add the following provider to the ``SCRAPY_POET_PROVIDERS`` setting: + + .. code-block:: python + + SCRAPY_POET_PROVIDERS = { + "scrapy_zyte_api.providers.ZyteApiProvider": 1100, + } + +You can now :ref:`use scrapy-poet ` to get data from Zyte API in +page objects. diff --git a/docs/first-steps/setup.rst b/docs/first-steps/setup.rst new file mode 100644 index 00000000..33e9f7c4 --- /dev/null +++ b/docs/first-steps/setup.rst @@ -0,0 +1,95 @@ +.. _setup: + +============= +Initial setup +============= + +Learn how to get scrapy-zyte-api installed and configured on an existing +:doc:`Scrapy ` project. + +.. tip:: :ref:`Zyte’s web scraping tutorial ` covers + scrapy-zyte-api setup as well. + +Requirements +============ + +You need at least: + +- A :ref:`Zyte API ` subscription (there’s a :ref:`free trial + `). + +- Python 3.7+ + +- Scrapy 2.0.1+ + +:doc:`scrapy-poet ` integration requires higher versions: + +- Python 3.8+ + +- Scrapy 2.6+ + + +Installation +============ + +.. code-block:: shell + + pip install scrapy-zyte-api + + +Configuration +============= + +Get your `Zyte API key`_, and add it to your project ``settings.py``: + +.. _Zyte API key: https://app.zyte.com/o/zyte-api/api-access + +.. code-block:: python + + ZYTE_API_KEY = "YOUR_API_KEY" + +Alternatively, you can set your API key in the ``ZYTE_API_KEY`` environment +variable instead. + +Then, set up scrapy-zyte-api integration in ``settings.py``: + +.. code-block:: python + + DOWNLOAD_HANDLERS = { + "http": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler", + "https": "scrapy_zyte_api.ScrapyZyteAPIDownloadHandler", + } + DOWNLOADER_MIDDLEWARES = { + "scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 1000, + } + REQUEST_FINGERPRINTER_CLASS = "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter" + TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" + +By default, scrapy-zyte-api doesn't change the spider behavior. 
To switch your +spider to use Zyte API for all requests, set the following setting as well: + +.. code-block:: python + + ZYTE_API_TRANSPARENT_MODE = True + +If you already had a custom value for :setting:`REQUEST_FINGERPRINTER_CLASS +`, set that value on +:ref:`ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS` instead. + +If you had a different value for :setting:`TWISTED_REACTOR +` or no value at all, you will be changing the Twisted +reactor that your Scrapy project uses, and your existing code may need changes, +such as: + +- :ref:`asyncio-preinstalled-reactor`. + + Some Twisted imports install the default, non-asyncio Twisted + reactor as a side effect. Once a reactor is installed, it cannot be + changed for the whole run time. + +- :ref:`asyncio-await-dfd`. + + Note that you might be using Deferreds without realizing it through + some Scrapy functions and methods. For example, when you yield the + return value of ``self.crawler.engine.download()`` from a spider + callback, you are yielding a Deferred. diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..3834977f --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,49 @@ +============================= +scrapy-zyte-api documentation +============================= + +.. include:: ../README.rst + :start-after: .. description starts + :end-before: .. description ends + +After the :ref:`initial setup `, you can use Zyte API automatically, +either :ref:`globally ` or :ref:`per request `, or +:ref:`manually per request `. + +.. toctree:: + :caption: First steps + :hidden: + + first-steps/setup + first-steps/scrapy-poet + +.. toctree:: + :caption: Usage + :hidden: + + usage/transparent + usage/manual + usage/automap + usage/default + usage/retry + usage/scrapy-poet + usage/stats + usage/fingerprint + usage/proxy + +.. toctree:: + :caption: Reference + :hidden: + + reference/request + reference/response + reference/settings + reference/meta + reference/inputs + reference/fingerprint-params + +.. 
toctree:: + :caption: All the rest + :hidden: + + changes diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..954237b9 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/reference/fingerprint-params.rst b/docs/reference/fingerprint-params.rst new file mode 100644 index 00000000..d6f9c88e --- /dev/null +++ b/docs/reference/fingerprint-params.rst @@ -0,0 +1,36 @@ +.. _fingerprint-params: + +================================= +Request fingerprinting parameters +================================= + +The request fingerprinter class of scrapy-zyte-api generates request +fingerprints for Zyte API requests based on the following Zyte API parameters: + +- :http:`request:url` (:func:`canonicalized `) + + For URLs that include a URL fragment, like ``https://example.com#foo``, URL + canonicalization keeps the URL fragment if :http:`request:browserHtml` or + :http:`request:screenshot` are enabled. 
+ +- Request attribute parameters (:http:`request:httpRequestBody`, + :http:`request:httpRequestMethod`) + +- Output parameters (:http:`request:browserHtml`, + :http:`request:httpResponseBody`, :http:`request:httpResponseHeaders`, + :http:`request:screenshot`) + +- Rendering option parameters (:http:`request:actions`, + :http:`request:javascript`, :http:`request:screenshotOptions`) + +- :http:`request:geolocation` + +The following Zyte API parameters are *not* taken into account for request +fingerprinting: + +- Request header parameters (:http:`request:customHttpRequestHeaders`, + :http:`request:requestHeaders`) + +- Metadata parameters (:http:`request:echoData`, :http:`request:jobId`) + +- Experimental parameters (:http:`request:experimental`) diff --git a/docs/reference/inputs.rst b/docs/reference/inputs.rst new file mode 100644 index 00000000..e5fbe3a4 --- /dev/null +++ b/docs/reference/inputs.rst @@ -0,0 +1,26 @@ +.. _inputs: + +====== +Inputs +====== + +:ref:`scrapy-poet integration `, once :ref:`set up +`, allows obtaining the following :ref:`inputs +` from :doc:`web-poet ` and +:doc:`zyte-common-items ` through Zyte API: + +- :class:`web_poet.BrowserHtml` + +- :class:`web_poet.BrowserResponse` + +- :class:`zyte_common_items.Article` + +- :class:`zyte_common_items.ArticleList` + +- :class:`zyte_common_items.ArticleNavigation` + +- :class:`zyte_common_items.Product` + +- :class:`zyte_common_items.ProductList` + +- :class:`zyte_common_items.ProductNavigation` diff --git a/docs/reference/meta.rst b/docs/reference/meta.rst new file mode 100644 index 00000000..3ba83b57 --- /dev/null +++ b/docs/reference/meta.rst @@ -0,0 +1,60 @@ +.. _meta: + +================= +Request.meta keys +================= + +Keys that can be defined in :attr:`Request.meta ` for +scrapy-zyte-api. + +.. _zyte_api: + +zyte_api +======== + +Default: ``False`` + +See :ref:`manual`. + + +.. 
_zyte_api_automap: + +zyte_api_automap +================ + +Default: :ref:`ZYTE_API_TRANSPARENT_MODE` (``False``) + +See :ref:`automap`. + + +.. _zyte_api_default_params_meta: + +zyte_api_default_params +======================= + +Default: ``True`` + +If set to ``False``, the values of :ref:`ZYTE_API_AUTOMAP_PARAMS` and +:ref:`ZYTE_API_DEFAULT_PARAMS` are ignored for this request. + + +.. _zyte_api_retry_policy_meta: + +zyte_api_retry_policy +===================== + +Default: :ref:`ZYTE_API_RETRY_POLICY` +(:data:`zyte_api.aio.retry.zyte_api_retrying`) + +Determines the retry policy for Zyte API requests used to fulfill this request. + +It must be a :class:`tenacity.AsyncRetrying` subclass or its import path as a +string. + +.. note:: If you need your request to be serializable, e.g. to use + :class:`~scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware`, you + must specify the import path of your retry policy class as a string, + because `retry policies are not serializable + `_. + +See :ref:`retry`. diff --git a/docs/reference/request.rst b/docs/reference/request.rst new file mode 100644 index 00000000..bd4c9933 --- /dev/null +++ b/docs/reference/request.rst @@ -0,0 +1,183 @@ +.. _request: + +=============== +Request mapping +=============== + +When you enable automatic request parameter mapping, be it through +:ref:`transparent mode ` or :ref:`for a specific request +`, some Zyte API parameters are :ref:`chosen automatically for you +`, and you can then :ref:`change them further +` if you wish. + +.. _request-automatic: + +Automatic mapping +================= + +- :attr:`Request.url ` becomes :http:`request:url`, + same as in :ref:`requests with manual parameters `. + +- If :attr:`Request.method ` is something other + than ``"GET"``, it becomes :http:`request:httpRequestMethod`. + +- :attr:`Request.headers ` become + :http:`request:customHttpRequestHeaders`. + +- :attr:`Request.body ` becomes + :http:`request:httpRequestBody`. 
+ +- If :ref:`ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED` is ``True``, + :setting:`COOKIES_ENABLED ` is ``True`` (default), + and :attr:`Request.meta ` does not set + :reqmeta:`dont_merge_cookies ` to ``True``: + + - :http:`request:experimental.responseCookies` becomes ``True``. + + - Cookies from the :reqmeta:`cookiejar ` become + :http:`request:experimental.requestCookies`. + + All cookies from the cookie jar are set, regardless of their cookie + domain. This is because Zyte API requests may involve requests to + different domains (e.g. when following cross-domain redirects, or + during browser rendering). + + See also: :ref:`ZYTE_API_MAX_COOKIES`, + :ref:`ZYTE_API_COOKIE_MIDDLEWARE`. + +- :http:`request:httpResponseBody` and :http:`request:httpResponseHeaders` + are set to ``True``. + + This is subject to change without prior notice in future versions of + scrapy-zyte-api, so please account for the following: + + - If you are requesting a binary resource, such as a PDF file or an + image file, set :http:`request:httpResponseBody` to ``True`` explicitly + in your requests: + + .. code-block:: python + + Request( + url="https://toscrape.com/img/zyte.png", + meta={ + "zyte_api_automap": {"httpResponseBody": True}, + }, + ) + + In the future, we may stop setting :http:`request:httpResponseBody` to + ``True`` by default, and instead use a different, new Zyte API + parameter that only works for non-binary responses (e.g. HTML, JSON, + plain text). + + - If you need to access response headers, be it through + :attr:`response.headers ` + or through + :attr:`response.raw_api_response["httpResponseHeaders"] `, + set :http:`request:httpResponseHeaders` to ``True`` explicitly in your + requests: + + .. 
code-block:: python + + Request( + url="https://toscrape.com/", + meta={ + "zyte_api_automap": {"httpResponseHeaders": True}, + }, + ) + + At the moment scrapy-zyte-api requests response headers because some + response headers are necessary to properly decode the response body as + text. In the future, Zyte API may be able to handle this decoding + automatically, so scrapy-zyte-api would stop setting + :http:`request:httpResponseHeaders` to ``True`` by default. + +For example, the following Scrapy request: + +.. code-block:: python + + Request( + method="POST", + url="https://httpbin.org/anything", + headers={"Content-Type": "application/json"}, + body=b'{"foo": "bar"}', + cookies={"a": "b"}, + ) + +Results in a request to the Zyte API data extraction endpoint with the +following parameters: + +.. code-block:: javascript + + { + "customHttpRequestHeaders": [ + { + "name": "Content-Type", + "value": "application/json" + } + ], + "experimental": { + "requestCookies": [ + { + "name": "a", + "value": "b", + "domain": "" + } + ], + "responseCookies": true + }, + "httpResponseBody": true, + "httpResponseHeaders": true, + "httpRequestBody": "eyJmb28iOiAiYmFyIn0=", + "httpRequestMethod": "POST", + "url": "https://httpbin.org/anything" + } + +Header mapping +============== + +When mapping headers, headers not supported by Zyte API are excluded from the +mapping by default. + +Use :ref:`ZYTE_API_SKIP_HEADERS` and :ref:`ZYTE_API_BROWSER_HEADERS` to change +which headers are included or excluded from header mapping. + + +.. 
_request-unsupported: + +Unsupported scenarios +===================== + +To maximize support for potential future changes in Zyte API, automatic +request parameter mapping allows some parameter values and parameter +combinations that Zyte API does not currently support, and may never support: + +- :attr:`Request.method ` becomes + :http:`request:httpRequestMethod` even for unsupported + :http:`request:httpRequestMethod` values, and even if + :http:`request:httpResponseBody` is unset. + +- You can set :http:`request:customHttpRequestHeaders` or + :http:`request:requestHeaders` to ``True`` to force their mapping from + :attr:`Request.headers ` in scenarios where + they would not be mapped otherwise. + + Conversely, you can set :http:`request:customHttpRequestHeaders` or + :http:`request:requestHeaders` to ``False`` to prevent their mapping from + :attr:`Request.headers `. + +- :attr:`Request.body ` becomes + :http:`request:httpRequestBody` even if :http:`request:httpResponseBody` is + unset. + +- You can set :http:`request:httpResponseBody` to ``False`` (which unsets the + parameter), and not set :http:`request:browserHtml` or + :http:`request:screenshot` to ``True``. In this case, + :attr:`Request.headers ` is mapped as + :http:`request:requestHeaders`. + +- You can set :http:`request:httpResponseBody` to ``True`` and also set + :http:`request:browserHtml` or :http:`request:screenshot` to ``True``. In + this case, :attr:`Request.headers ` is mapped + both as :http:`request:customHttpRequestHeaders` and as + :http:`request:requestHeaders`, and :http:`request:browserHtml` is used as + :class:`response.body `. diff --git a/docs/reference/response.rst b/docs/reference/response.rst new file mode 100644 index 00000000..ce4d78b4 --- /dev/null +++ b/docs/reference/response.rst @@ -0,0 +1,151 @@ +.. _response: + +================ +Response mapping +================ + +.. 
_response-parameters: + +Parameters +========== + +Zyte API response parameters are mapped into :ref:`response class +` attributes where possible: + +- :http:`response:url` becomes :class:`response.url + `. + +- :http:`response:statusCode` becomes :class:`response.status + `. + +- :http:`response:httpResponseHeaders` and + :http:`response:experimental.responseCookies` become + :class:`response.headers + `. + +- :http:`response:experimental.responseCookies` is also mapped into the + request :reqmeta:`cookiejar `. + +- :http:`response:browserHtml` and :http:`response:httpResponseBody` are + mapped into both + :class:`response.text ` + and + :class:`response.body `. + + If none of these parameters were present, e.g. if the only requested output + was :http:`response:screenshot`, + :class:`response.text ` + and + :class:`response.body ` + would be empty. + + If a future version of Zyte API supported requesting both outputs on the + same request, and both parameters were present, + :http:`response:browserHtml` would be the one mapped into + :class:`response.text ` + and + :class:`response.body `. + +Both :ref:`response classes ` have a +:class:`response.raw_api_response ` +attribute that contains a :class:`dict` with the complete, raw response from +Zyte API, where you can find all Zyte API response parameters, including those +that are not mapped into other response class attributes. + +For example, for a request for :http:`response:httpResponseBody` and +:http:`response:httpResponseHeaders`, you would get: + +.. 
code-block:: python + + def parse(self, response): + print(response.url) + # "https://quotes.toscrape.com/" + print(response.status) + # 200 + print(response.headers) + # {b"Content-Type": [b"text/html"], …} + print(response.text) + # "…" + print(response.body) + # b"…" + print(response.raw_api_response) + # { + # "url": "https://quotes.toscrape.com/", + # "statusCode": 200, + # "httpResponseBody": "PGh0bWw+4oCmPC9odG1sPg==", + # "httpResponseHeaders": […], + # } + +For a request for :http:`response:screenshot`, on the other hand, the response +would look as follows: + +.. code-block:: python + + def parse(self, response): + print(response.url) + # "https://quotes.toscrape.com/" + print(response.status) + # 200 + print(response.headers) + # {} + print(response.text) + # "" + print(response.body) + # b"" + print(response.raw_api_response) + # { + # "url": "https://quotes.toscrape.com/", + # "statusCode": 200, + # "screenshot": "iVBORw0KGgoAAAANSUh…", + # } + from base64 import b64decode + print(b64decode(response.raw_api_response["screenshot"])) + # b'\x89PNG\r\n\x1a\n\x00\x00\x00\r…' + + +.. _response-classes: + +Classes +======= + +Zyte API responses are mapped with one of the following classes: + +- :class:`~scrapy_zyte_api.responses.ZyteAPITextResponse` is used to map text + responses, i.e. responses with :http:`response:browserHtml` or responses + with both :http:`response:httpResponseBody` and + :http:`response:httpResponseHeaders` with a text body (e.g. plain text, + HTML, JSON). + +- :class:`~scrapy_zyte_api.responses.ZyteAPIResponse` is used to map any + other response. + +.. autoclass:: scrapy_zyte_api.responses.ZyteAPIResponse + :show-inheritance: + + .. autoattribute:: url + + .. autoattribute:: status + + .. autoattribute:: headers + + .. attribute:: body + :type: bytes + + .. autoattribute:: raw_api_response + +.. autoclass:: scrapy_zyte_api.responses.ZyteAPITextResponse + :show-inheritance: + + .. autoattribute:: url + + .. 
autoattribute:: status + + .. autoattribute:: headers + + .. attribute:: body + :type: bytes + + .. attribute:: text + :type: str + + .. autoattribute:: raw_api_response diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst new file mode 100644 index 00000000..e24c2a16 --- /dev/null +++ b/docs/reference/settings.rst @@ -0,0 +1,266 @@ +.. _settings: + +======== +Settings +======== + +:ref:`Settings ` for scrapy-zyte-api. + +.. _ZYTE_API_AUTOMAP_PARAMS: + +ZYTE_API_AUTOMAP_PARAMS +======================= + +Default: ``{}`` + +:class:`dict` of parameters to be combined with :ref:`automatic request +parameters `. + +These parameters are merged with :ref:`zyte_api_automap` parameters. +:ref:`zyte_api_automap` parameters take precedence. + +This setting has no effect on requests with :ref:`manual request parameters +`. + +When using :ref:`transparent mode `, be careful of which +parameters you define in this setting. In transparent mode, all Scrapy requests +go through Zyte API, even requests that Scrapy sends automatically, such as +those for ``robots.txt`` files when :setting:`ROBOTSTXT_OBEY +` is ``True``, or those for sitemaps when using +:class:`~scrapy.spiders.SitemapSpider`. Certain parameters, like +:http:`request:browserHtml` or :http:`request:screenshot`, are not meant to be +used for every single request. + +If :ref:`zyte_api_default_params ` in +:attr:`Request.meta ` is set to ``False``, this +setting is ignored for that request. + +See :ref:`default`. + + +.. _ZYTE_API_BROWSER_HEADERS: + +ZYTE_API_BROWSER_HEADERS +======================== + +Default: ``{"Referer": "referer"}`` + +Determines headers that *can* be mapped as :http:`request:requestHeaders`. + +It is a :class:`dict`, where keys are header names and values are the key that +represents them in :http:`request:requestHeaders`. + + +.. 
_ZYTE_API_COOKIE_MIDDLEWARE: + +ZYTE_API_COOKIE_MIDDLEWARE +========================== + +Default: :class:`scrapy.downloadermiddlewares.cookies.CookiesMiddleware` + +If you are using a custom downloader middleware to handle request cookie jars, +you can point this setting to its import path to make scrapy-zyte-api work with +it. + +Your cookie downloader middleware must have a ``jars`` property with the same +signature as in the built-in Scrapy downloader middleware for cookie handling. + + +.. _ZYTE_API_DEFAULT_PARAMS: + +ZYTE_API_DEFAULT_PARAMS +======================= + +Default: ``{}`` + +:class:`dict` of parameters to be combined with :ref:`manual request parameters +`. + +You may set :ref:`zyte_api` to an empty :class:`dict` to only use the +parameters defined here for that request. + +These parameters are merged with :ref:`zyte_api` parameters. :ref:`zyte_api` +parameters take precedence. + +This setting has no effect on requests with :ref:`automatic request parameters +`. + +If :ref:`zyte_api_default_params ` in +:attr:`Request.meta ` is set to ``False``, this +setting is ignored for that request. + +See :ref:`default`. + + +.. _ZYTE_API_ENABLED: + +ZYTE_API_ENABLED +================ + +Default: ``True`` + +Can be set to ``False`` to disable scrapy-zyte-api. + + +.. _ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED: + +ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED +===================================== + +Default: ``False`` + +See :ref:`request-automatic`. + + +.. _ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS: + +ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS +============================================= + +Default: :class:`scrapy.utils.request.RequestFingerprinter` + +:ref:`Request fingerprinter ` to use for requests that do not +go through Zyte API. See :ref:`fingerprint`. + + +.. _ZYTE_API_KEY: + +ZYTE_API_KEY +============ + +Default: ``None`` + +Your `Zyte API key`_. + +.. 
_Zyte API key: https://app.zyte.com/o/zyte-api/api-access + +You can alternatively define an environment variable with the same name. + +.. tip:: On :ref:`Scrapy Cloud `, this setting is defined + automatically. + + +.. _ZYTE_API_LOG_REQUESTS: + +ZYTE_API_LOG_REQUESTS +===================== + +Default: ``False`` + +Set this to ``True`` and :setting:`LOG_LEVEL ` to ``"DEBUG"`` +to enable the logging of debug messages that indicate the JSON object sent on +every Zyte API request. + +For example:: + + Sending Zyte API extract request: {"url": "https://example.com", "httpResponseBody": true} + +See also: :ref:`ZYTE_API_LOG_REQUESTS_TRUNCATE`. + + +.. _ZYTE_API_LOG_REQUESTS_TRUNCATE: + +ZYTE_API_LOG_REQUESTS_TRUNCATE +============================== + +Default: ``64`` + +Determines the maximum length of any string value in the JSON object logged +when :ref:`ZYTE_API_LOG_REQUESTS` is enabled, excluding object keys. + +To disable truncation, set this to ``0``. + + +.. _ZYTE_API_MAX_COOKIES: + +ZYTE_API_MAX_COOKIES +==================== + +Default: ``100`` + +If the cookies to be set during :ref:`request mapping ` exceed this +limit, a warning is logged, and only as many cookies as the limit allows are +set for the target request. + +To silence this warning, set :http:`request:experimental.requestCookies` +manually, e.g. to an empty :class:`dict`. + +Alternatively, if :http:`request:experimental.requestCookies` starts supporting +more than 100 cookies, update this setting accordingly. + + +.. _ZYTE_API_MAX_REQUESTS: + +ZYTE_API_MAX_REQUESTS +===================== + +Default: ``None`` + +When set to an integer value > 0, the spider will close when the number of Zyte +API requests reaches it. + +Note that requests with error responses that cannot be retried or exceed their +retry limit also count here. + + +.. 
_ZYTE_API_PROVIDER_PARAMS: + +ZYTE_API_PROVIDER_PARAMS +======================== + +Default: ``{}`` + +Defines additional request parameters to use in Zyte API requests sent by the +:ref:`scrapy-poet integration `. + + +.. _ZYTE_API_RETRY_POLICY: + +ZYTE_API_RETRY_POLICY +===================== + +Default: ``"zyte_api.aio.retry.zyte_api_retrying"`` + +Determines the retry policy for Zyte API requests. + +It must be a string with the import path of a :class:`tenacity.AsyncRetrying` +subclass. + +.. note:: :ref:`Settings ` must be :mod:`picklable `, + and `retry policies are not `_, + so you cannot assign a retry policy class directly to this setting, you + must use their import path as a string instead. + +See :ref:`retry`. + + +.. _ZYTE_API_SKIP_HEADERS: + +ZYTE_API_SKIP_HEADERS +===================== + +Default: ``["User-Agent"]`` + +Determines headers that must *not* be mapped as +:http:`request:customHttpRequestHeaders`. + + +.. _ZYTE_API_TRANSPARENT_MODE: + +ZYTE_API_TRANSPARENT_MODE +========================= + +Default: ``False`` + +See :ref:`transparent`. + + +.. _ZYTE_API_USE_ENV_PROXY: + +ZYTE_API_USE_ENV_PROXY +====================== + +Default: ``False`` + +Set to ``True`` to make Zyte API requests respect system proxy settings. See +:ref:`proxy`. diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..c71ccc8d --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,2 @@ +Sphinx==7.2.6 +sphinx-rtd-theme==1.3.0 diff --git a/docs/usage/automap.rst b/docs/usage/automap.rst new file mode 100644 index 00000000..2f81a2fb --- /dev/null +++ b/docs/usage/automap.rst @@ -0,0 +1,80 @@ +.. _automap: + +============================ +Automatic request parameters +============================ + +To send a Scrapy request through Zyte API letting Zyte API request parameters +be automatically chosen based on the parameters of that Scrapy request, set the +:ref:`zyte_api_automap` key in :attr:`Request.meta ` +to ``True``. 
+ +For example: + +.. code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + + def start_requests(self): + yield scrapy.Request( + url="https://quotes.toscrape.com/", + meta={ + "zyte_api_automap": True, + }, + ) + + def parse(self, response): + print(response.text) + # "…" + +In :ref:`transparent mode `, :ref:`zyte_api_automap` is ``True`` +by default. + +See :ref:`request` to learn how exactly request parameters are mapped when +using automatic request parameters. + + +.. _request-change: + +Changing parameters +=================== + +You may set :ref:`zyte_api_automap` in :attr:`Request.meta +` to a :class:`dict` of Zyte API parameters to add, +modify, or remove (by setting to ``False``) automatic request parameters. This +also works in :ref:`transparent mode `. + +Enabling :http:`request:browserHtml`, :http:`request:screenshot`, or an +automatic extraction property, unsets :http:`request:httpResponseBody` and +:http:`request:httpResponseHeaders`, and makes ``Request.headers`` become +:http:`request:requestHeaders` instead of +:http:`request:customHttpRequestHeaders`. For example, the following Scrapy +request: + +.. code-block:: python + + Request( + url="https://quotes.toscrape.com", + headers={"Referer": "https://example.com/"}, + meta={"zyte_api_automap": {"browserHtml": True}}, + ) + +Results in a request to the Zyte API data extraction endpoint with the +following parameters: + +.. code-block:: javascript + + { + "browserHtml": true, + "experimental": { + "responseCookies": true + }, + "requestHeaders": {"referer": "https://example.com/"}, + "url": "https://quotes.toscrape.com" + } + +See also: :ref:`request-unsupported`. \ No newline at end of file diff --git a/docs/usage/default.rst b/docs/usage/default.rst new file mode 100644 index 00000000..582e3617 --- /dev/null +++ b/docs/usage/default.rst @@ -0,0 +1,22 @@ +.. 
_default: + +================== +Default parameters +================== + +Often the same configuration needs to be used for all Zyte API requests. For +example, all requests may need to set the same :http:`request:geolocation`, or +the spider only uses :http:`request:browserHtml` requests. + +The following settings allow you to define Zyte API parameters to be included +in all requests: + +- :ref:`ZYTE_API_AUTOMAP_PARAMS`, for :ref:`transparent mode ` + and :ref:`automatic request parameters `. + +- :ref:`ZYTE_API_DEFAULT_PARAMS`, for :ref:`manual request parameters + `. + +For example, if you set :ref:`ZYTE_API_DEFAULT_PARAMS` to +``{"geolocation": "US"}`` and :ref:`zyte_api` to ``{"browserHtml": True}``, +``{"url": "…", "geolocation": "US", "browserHtml": True}`` is sent to Zyte API. diff --git a/docs/usage/fingerprint.rst b/docs/usage/fingerprint.rst new file mode 100644 index 00000000..0a19ccd3 --- /dev/null +++ b/docs/usage/fingerprint.rst @@ -0,0 +1,57 @@ +.. _fingerprint: + +Request fingerprinting +====================== + +The request fingerprinter class of scrapy-zyte-api ensures that Scrapy 2.7 and +later generate unique :ref:`request fingerprints ` for +Zyte API requests :ref:`based on some of their parameters +`. + +For example, a request for :http:`request:browserHtml` and a request for +:http:`request:screenshot` with the same target URL are considered different +requests. Similarly, requests with the same target URL but different +:http:`request:actions` are also considered different requests. + +Use :ref:`ZYTE_API_FALLBACK_REQUEST_FINGERPRINTER_CLASS` to define a custom +request fingerprinting for requests that do not go through Zyte API. + + +Request fingerprinting before Scrapy 2.7 +---------------------------------------- + +If you have a Scrapy version older than Scrapy 2.7, Zyte API parameters are not +taken into account for request fingerprinting.
This can cause some Scrapy +components, like the filter of duplicate requests or the HTTP cache extension, +to interpret 2 different requests as being the same. + +To avoid most issues, use :ref:`automatic request parameters `, either +through :ref:`transparent mode ` or setting +:ref:`zyte_api_automap` to ``True`` in :attr:`Request.meta +`, and then use :class:`~scrapy.http.Request` +attributes instead of :attr:`Request.meta ` as much +as possible. Unlike :attr:`Request.meta `, +:class:`~scrapy.http.Request` attributes do affect request fingerprints in +Scrapy versions older than Scrapy 2.7. + +For requests that must have the same :class:`~scrapy.http.Request` attributes +but should still be considered different, such as browser-based requests with +different URL fragments, you can set ``dont_filter=True`` when creating your +request to prevent the duplicate filter of Scrapy from filtering them out. +For example: + +.. code-block:: python + + yield Request( + "https://toscrape.com#1", + meta={"zyte_api_automap": {"browserHtml": True}}, + dont_filter=True, + ) + yield Request( + "https://toscrape.com#2", + meta={"zyte_api_automap": {"browserHtml": True}}, + dont_filter=True, + ) + +Note, however, that for other Scrapy components, like the HTTP cache +extensions, these 2 requests would still be considered identical. diff --git a/docs/usage/manual.rst b/docs/usage/manual.rst new file mode 100644 index 00000000..525c530c --- /dev/null +++ b/docs/usage/manual.rst @@ -0,0 +1,67 @@ +.. _manual: + +========================= +Manual request parameters +========================= + +To send a Scrapy request through Zyte API with manually-defined Zyte API +request parameters, define your parameters in the :ref:`zyte_api` key in +:attr:`Request.meta ` as a :class:`dict`. + +The only exception is the :http:`request:url` parameter, which should not be +defined as a Zyte API parameter. The value from :attr:`Request.url +` is used automatically. + +For example: + +..
code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + + def start_requests(self): + yield scrapy.Request( + url="https://quotes.toscrape.com/", + meta={ + "zyte_api": { + "browserHtml": True, + } + }, + ) + + def parse(self, response): + print(response.text) + # "…" + +Note that response headers are necessary for raw response decoding. When +defining parameters manually and requesting :http:`request:httpResponseBody`, +remember to also request :http:`request:httpResponseHeaders`: + +.. code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + + def start_requests(self): + yield scrapy.Request( + url="https://quotes.toscrape.com/", + meta={ + "zyte_api": { + "httpResponseBody": True, + "httpResponseHeaders": True, + } + }, + ) + + def parse(self, response): + print(response.text) + # "…" + +To learn more about Zyte API parameters, see the upstream :ref:`usage +` and :ref:`API reference ` pages. diff --git a/docs/usage/proxy.rst b/docs/usage/proxy.rst new file mode 100644 index 00000000..a14731a9 --- /dev/null +++ b/docs/usage/proxy.rst @@ -0,0 +1,9 @@ +.. _proxy: + +============= +Using a proxy +============= + +If you need a proxy to access Zyte API (e.g. a corporate proxy), configure +the ``HTTP_PROXY`` and ``HTTPS_PROXY`` environment variables accordingly, and +set the :ref:`ZYTE_API_USE_ENV_PROXY` setting to ``True``. diff --git a/docs/usage/retry.rst b/docs/usage/retry.rst new file mode 100644 index 00000000..f497c82f --- /dev/null +++ b/docs/usage/retry.rst @@ -0,0 +1,69 @@ +.. _retry: + +Retries +======= + +API requests are retried automatically using the default retry policy of +:doc:`python-zyte-api `. + +API requests that exceed retries are dropped. You cannot manage API request +retries through :ref:`downloader middlewares `. 
+ +Use the :ref:`ZYTE_API_RETRY_POLICY` setting or the :ref:`zyte_api_retry_policy +` :attr:`Request.meta ` +key to override the default retry policy with a custom retry policy. + +For example, to increase the maximum number of retries to 10 before dropping +the API request, you can subclass :class:`~zyte_api.aio.retry.RetryFactory` as +follows: + +.. code-block:: python + + # project/retry_policies.py + from tenacity import stop_after_attempt + from zyte_api.aio.retry import RetryFactory + + class CustomRetryFactory(RetryFactory): + temporary_download_error_stop = stop_after_attempt(10) + + CUSTOM_RETRY_POLICY = CustomRetryFactory().build() + + # project/settings.py + ZYTE_API_RETRY_POLICY = "project.retry_policies.CUSTOM_RETRY_POLICY" + + +To extend this retry policy, so it will also retry HTTP 521 errors, the same +as HTTP 520 errors, you can implement: + +.. code-block:: python + + # project/retry_policies.py + from tenacity import retry_if_exception, RetryCallState, stop_after_attempt + from zyte_api.aio.errors import RequestError + from zyte_api.aio.retry import RetryFactory + + def is_http_521(exc: BaseException) -> bool: + return isinstance(exc, RequestError) and exc.status == 521 + + class CustomRetryFactory(RetryFactory): + + retry_condition = ( + RetryFactory.retry_condition + | retry_if_exception(is_http_521) + ) + temporary_download_error_stop = stop_after_attempt(10) + + def wait(self, retry_state: RetryCallState) -> float: + if is_http_521(retry_state.outcome.exception()): + return self.temporary_download_error_wait(retry_state=retry_state) + return super().wait(retry_state) + + def stop(self, retry_state: RetryCallState) -> bool: + if is_http_521(retry_state.outcome.exception()): + return self.temporary_download_error_stop(retry_state) + return super().stop(retry_state) + + CUSTOM_RETRY_POLICY = CustomRetryFactory().build() + + # project/settings.py + ZYTE_API_RETRY_POLICY = "project.retry_policies.CUSTOM_RETRY_POLICY" diff --git 
a/docs/usage/scrapy-poet.rst b/docs/usage/scrapy-poet.rst new file mode 100644 index 00000000..a043b7ea --- /dev/null +++ b/docs/usage/scrapy-poet.rst @@ -0,0 +1,67 @@ +.. _scrapy-poet: + +======================= +scrapy-poet integration +======================= + +After you :ref:`set up scrapy-poet integration `, you can +request :ref:`supported page inputs ` in your page objects:: + + @attrs.define + class ProductPage(BasePage): + response: BrowserResponse + product: Product + + + class ZyteApiSpider(scrapy.Spider): + ... + + def parse_page(self, response: DummyResponse, page: ProductPage): + ... + +Or request them directly in the callback:: + + class ZyteApiSpider(scrapy.Spider): + ... + + def parse_page(self, + response: DummyResponse, + browser_response: BrowserResponse, + product: Product, + ): + ... + +Default parameters +================== + +scrapy-poet integration ignores :ref:`default parameters `. + +To add extra parameters to all Zyte API requests sent by the provider, set them +as a dictionary through the :ref:`ZYTE_API_PROVIDER_PARAMS` setting, for +example in ``settings.py``:: + + ZYTE_API_PROVIDER_PARAMS = {"geolocation": "IE"} + +When :ref:`ZYTE_API_PROVIDER_PARAMS` includes one of the Zyte API extraction +options (e.g. ``productOptions`` for ``product``), but the final Zyte API +request doesn't include the corresponding data type, the unused options are +automatically removed. 
So, it's safe to use :ref:`ZYTE_API_PROVIDER_PARAMS` to +set the default options for various extraction types, e.g.:: + + ZYTE_API_PROVIDER_PARAMS = { + "productOptions": {"extractFrom": "httpResponseBody"}, + "productNavigationOptions": {"extractFrom": "httpResponseBody"}, + } + +Note that the built-in ``scrapy_poet.page_input_providers.ItemProvider`` has a +priority of 2000, so when you have page objects producing +:class:`zyte_common_items.Product` items you should use higher values for +``ZyteApiProvider`` if you want these items to come from these page objects, +and lower values if you want them to come from Zyte API. + +Currently, when ``ItemProvider`` is used together with ``ZyteApiProvider``, +it may make more requests than is optimal: the normal Scrapy response will +always be requested even when using a :class:`~scrapy_poet.DummyResponse` +annotation, and in some dependency combinations two Zyte API requests will be +made for the same page. We are planning to solve these problems in future +releases of :doc:`scrapy-poet ` and scrapy-zyte-api. diff --git a/docs/usage/stats.rst b/docs/usage/stats.rst new file mode 100644 index 00000000..392838e4 --- /dev/null +++ b/docs/usage/stats.rst @@ -0,0 +1,19 @@ +.. _stats: + +===== +Stats +===== + +Stats from :doc:`python-zyte-api ` are exposed as +:ref:`Scrapy stats ` with the ``scrapy-zyte-api`` prefix. + +For example, ``scrapy-zyte-api/status_codes/`` stats indicate the +status code of Zyte API responses (e.g. ``429`` for :ref:`rate limiting +` or ``520`` for :ref:`temporary download errors +`). + +.. note:: The actual status code that is received from the target website, i.e. + the :http:`response:statusCode` response field of a :ref:`Zyte API + successful response `, is accounted for in + the ``downloader/response_status_count/`` stat, as with any + other Scrapy response.
diff --git a/docs/usage/transparent.rst b/docs/usage/transparent.rst new file mode 100644 index 00000000..354a710c --- /dev/null +++ b/docs/usage/transparent.rst @@ -0,0 +1,35 @@ +.. _transparent: + +================ +Transparent mode +================ + +Set :ref:`ZYTE_API_TRANSPARENT_MODE` to ``True`` to handle requests as follows: + +- By default, requests are sent with :ref:`automatic request + parameters `. + +- Requests with :ref:`zyte_api` set to a ``dict`` are sent with :ref:`manual + request parameters `. + +- Requests with :ref:`zyte_api_automap` set to ``False`` are *not* sent + through Zyte API. + +For example: + +.. code-block:: python + + import scrapy + + + class SampleQuotesSpider(scrapy.Spider): + name = "sample_quotes" + start_urls = ["https://quotes.toscrape.com/"] + + custom_settings = { + "ZYTE_API_TRANSPARENT_MODE": True, + } + + def parse(self, response): + print(response.text) + # "…" diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py index 00e06351..16b1b86d 100644 --- a/scrapy_zyte_api/_params.py +++ b/scrapy_zyte_api/_params.py @@ -649,7 +649,7 @@ def _handle_warn_on_cookies(self, request, params): "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED is False, so automatic " "mapping will not map cookies for this or any other request. " "To silence this warning, disable cookies for all requests " - "that use automated mapping, either with the " + "that use automatic mapping, either with the " "COOKIES_ENABLED setting or with the dont_merge_cookies " "request metadata key." ), diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 18b63401..7e223131 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -56,8 +56,7 @@ def replace(self, *args, **kwargs): def raw_api_response(self) -> Optional[Dict]: """Contains the raw API response from Zyte API. - To see the full list of parameters and their description, kindly refer to the - `Zyte API Specification `_. 
+ For the full list of parameters, see :ref:`zyte-api-http-api`. """ return self._raw_api_response diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py index 4d861f36..0227cbc0 100644 --- a/tests/test_api_requests.py +++ b/tests/test_api_requests.py @@ -700,7 +700,7 @@ def _test_automap( [], ), # httpResponseBody can be explicitly requested in meta, and should be - # in cases where a binary response is expected, since automated mapping + # in cases where a binary response is expected, since automatic mapping # may stop working for binary responses in the future. ( {"httpResponseBody": True}, @@ -1948,7 +1948,7 @@ def test_automap_header_settings(settings, headers, meta, expected, warnings, ca ), # Setting requestCookies to [] disables automatic mapping, but logs a # a warning recommending to either use False to achieve the same or - # remove the parameter to let automated mapping work. + # remove the parameter to let automatic mapping work. ( { "ZYTE_API_EXPERIMENTAL_COOKIES_ENABLED": True, diff --git a/tox.ini b/tox.ini index 68107d3a..eb006d0e 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37,py38,py39,py310,py311,mypy,linters,twine-check +envlist = py37,py38,py39,py310,py311,mypy,linters,twine-check,docs [testenv] deps = @@ -109,3 +109,10 @@ deps = commands = python setup.py sdist twine check dist/* + +[testenv:docs] +changedir = docs +deps = + -rdocs/requirements.txt +commands = + sphinx-build -W -b html . {envtmpdir}/html