Skip to content

Commit

Permalink
Implement a parameter map (#151)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio authored Nov 30, 2023
1 parent dc09ac3 commit 702cc63
Show file tree
Hide file tree
Showing 5 changed files with 383 additions and 72 deletions.
31 changes: 25 additions & 6 deletions docs/reference/fingerprint-params.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,26 +11,45 @@ fingerprints for Zyte API requests based on the following Zyte API parameters:

For URLs that include a URL fragment, like ``https://example.com#foo``, URL
canonicalization keeps the URL fragment if :http:`request:browserHtml` or
:http:`request:screenshot` are enabled.
:http:`request:screenshot` are enabled, or if extractFrom_ is set to
``browserHtml``.

.. _extractFrom: https://docs.zyte.com/zyte-api/usage/extract.html#extraction-source

- Request attribute parameters (:http:`request:httpRequestBody`,
:http:`request:httpRequestMethod`)
:http:`request:httpRequestText`, :http:`request:httpRequestMethod`), except
headers

Equivalent :http:`request:httpRequestBody` and
:http:`request:httpRequestText` values generate the same signature.

- Output parameters (:http:`request:browserHtml`,
:http:`request:httpResponseBody`, :http:`request:httpResponseHeaders`,
:http:`request:screenshot`)
:http:`request:responseCookies`, :http:`request:screenshot`, and
:ref:`automatic extraction outputs <zyte-api-extract-fields>` like
:http:`request:product`)

- Rendering option parameters (:http:`request:actions`,
:http:`request:javascript`, :http:`request:screenshotOptions`)
:http:`request:device`, :http:`request:javascript`,
:http:`request:screenshotOptions`, :http:`request:viewport`, and automatic
extraction options like :http:`request:productOptions`)

- :http:`request:geolocation`

- :http:`request:echoData`

The following Zyte API parameters are *not* taken into account for request
fingerprinting:

- Request header parameters (:http:`request:customHttpRequestHeaders`,
:http:`request:requestHeaders`)

- Metadata parameters (:http:`request:echoData`, :http:`request:jobId`)
- Request cookie parameters (:http:`request:cookieManagement`,
:http:`request:requestCookies`)

- Session handling parameters (:http:`request:sessionContext`,
:http:`request:sessionContextParameters`)

- :http:`request:jobId`

- Experimental parameters (:http:`request:experimental`)
- Experimental parameters (:http:`experimental.* <request:experimental>`)
280 changes: 269 additions & 11 deletions scrapy_zyte_api/_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,274 @@

logger = getLogger(__name__)

# Sentinel used as the "default" of parameters that have no default value
# (only "url" below). Entries carrying it are left out of _DEFAULT_API_PARAMS.
_NoDefault = object()

# Map of all known root Zyte API request params and how they need to be
# handled. Sorted by appearance in
# https://docs.zyte.com/zyte-api/usage/reference.html.
#
# Every entry is a dict with the same 4 keys:
#
# - "default": value that the parameter is considered to have when it is not
#   set, or _NoDefault if there is none.
#
# - "is_extract_type": True for automatic extraction outputs (e.g. "product").
#   The keys flagged this way make up _EXTRACT_KEYS. The matching "*Options"
#   parameters are False here, since they do not trigger extraction on their
#   own.
#
# - "requires_browser_rendering": True for parameters that trigger browser
#   rendering by themselves. The keys flagged this way make up _BROWSER_KEYS.
#   Parameters that only tune browser rendering (e.g. "actions") are False.
#
# - "changes_fingerprint": presumably whether the parameter is taken into
#   account for request fingerprinting; the code reading this flag is not
#   part of this module section (TODO confirm against the fingerprinter).
_REQUEST_PARAMS: Dict[str, Dict[str, Any]] = {
"url": {
"default": _NoDefault,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"requestHeaders": {
"default": {},
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False,
},
"httpRequestMethod": {
"default": "GET",
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"httpRequestBody": {
"default": "",
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"httpRequestText": {
"default": "",
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"customHttpRequestHeaders": {
"default": [],
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False,
},
"httpResponseBody": {
"default": False,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"httpResponseHeaders": {
"default": False,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"browserHtml": {
"default": False,
"is_extract_type": False,
"requires_browser_rendering": True,
"changes_fingerprint": True,
},
"screenshot": {
"default": False,
"is_extract_type": False,
"requires_browser_rendering": True,
"changes_fingerprint": True,
},
"screenshotOptions": {
"default": {},
"is_extract_type": False,
"requires_browser_rendering": False, # Not on its own.
"changes_fingerprint": True,
},
"article": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"articleOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"articleList": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"articleListOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"articleNavigation": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"articleNavigationOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"jobPosting": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"jobPostingOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"product": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"productOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"productList": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"productListOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"productNavigation": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"productNavigationOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"geolocation": {
"default": None,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"javascript": {
"default": None,
"is_extract_type": False,
"requires_browser_rendering": False, # Not on its own.
"changes_fingerprint": True,
},
"actions": {
"default": [],
"is_extract_type": False,
"requires_browser_rendering": False, # Not on its own.
"changes_fingerprint": True,
},
"jobId": {
"default": None,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False,
},
"echoData": {
"default": None,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"viewport": {
"default": {},
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"sessionContext": {
"default": [],
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False, # Treated like headers.
},
"sessionContextParameters": {
"default": {},
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False, # Treated like sessionContext.
},
"device": {
"default": "auto",
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True, # Treated like viewport.
},
"cookieManagement": {
"default": "auto",
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False, # Treated like headers.
},
"requestCookies": {
"default": [],
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False, # Treated like headers.
},
"responseCookies": {
"default": False,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"experimental": {
"default": {},
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False,
},
}

_BROWSER_KEYS = {
key for key, value in _REQUEST_PARAMS.items() if value["requires_browser_rendering"]
}
_EXTRACT_KEYS = {
"article",
"articleList",
"articleNavigation",
"product",
"productList",
"productNavigation",
key for key, value in _REQUEST_PARAMS.items() if value["is_extract_type"]
}
_BROWSER_OR_EXTRACT_KEYS = _BROWSER_KEYS | _EXTRACT_KEYS
_DEFAULT_API_PARAMS = {
key: value["default"]
for key, value in _REQUEST_PARAMS.items()
if value["default"] != _NoDefault
}
_BROWSER_KEYS = _EXTRACT_KEYS | {"browserHtml", "screenshot"}
_DEFAULT_API_PARAMS = {key: False for key in _BROWSER_KEYS}

# Every entry of ACCEPTED_ENCODINGS, decoded to str and joined with ", ".
# NOTE(review): presumably the Accept-Encoding header value that is treated as
# the default one; its use is not visible in this part of the module.
_DEFAULT_ACCEPT_ENCODING = ", ".join(
encoding.decode() for encoding in ACCEPTED_ENCODINGS
)


def _uses_browser(api_params: Dict[str, Any]) -> bool:
    """Return ``True`` if *api_params* explicitly require browser rendering.

    That is the case when any parameter listed in ``_BROWSER_KEYS`` has a
    truthy value (falling back to its default from ``_REQUEST_PARAMS`` when
    it is not set), or when the ``<key>Options`` parameter of any key in
    ``_EXTRACT_KEYS`` sets ``extractFrom`` to ``"browserHtml"``.
    """
    browser_output_requested = any(
        api_params.get(name, _REQUEST_PARAMS[name]["default"])
        for name in _BROWSER_KEYS
    )
    if browser_output_requested:
        return True
    # Note: ``False`` here can actually mean "maybe", e.g. if no extractFrom
    # is specified, an extract key could still be triggering browser
    # rendering.
    return any(
        api_params.get(f"{name}Options", {}).get("extractFrom", None)
        == "browserHtml"
        for name in _EXTRACT_KEYS
    )


def _iter_headers(
*,
api_params: Dict[str, Any],
Expand Down Expand Up @@ -149,7 +401,7 @@ def _set_request_headers_from_request(
api_params.pop("customHttpRequestHeaders")

if (
(not response_body or any(api_params.get(k) for k in _BROWSER_KEYS))
(not response_body or any(api_params.get(k) for k in _BROWSER_OR_EXTRACT_KEYS))
and request_headers is not False
or request_headers is True
):
Expand All @@ -167,7 +419,7 @@ def _set_http_response_body_from_request(
api_params: Dict[str, Any],
request: Request,
):
if not any(api_params.get(k) for k in _BROWSER_KEYS):
if not any(api_params.get(k) for k in _BROWSER_OR_EXTRACT_KEYS):
api_params.setdefault("httpResponseBody", True)
elif api_params.get("httpResponseBody") is False:
logger.warning(
Expand Down Expand Up @@ -319,14 +571,20 @@ def _set_http_request_body_from_request(
api_params["httpRequestBody"] = base64_body


# Sentinel passed as the fallback of api_params.get() so that a parameter that
# is missing can be told apart from one that is set to any value, None
# included.
_Undefined = object()


def _unset_unneeded_api_params(
*,
api_params: Dict[str, Any],
default_params: Dict[str, Any],
request: Request,
):
for param, default_value in _DEFAULT_API_PARAMS.items():
if api_params.get(param) != default_value:
value = api_params.get(param, _Undefined)
if value is _Undefined:
continue
if value != default_value:
continue
if param not in default_params or default_params.get(param) == default_value:
logger.warning(
Expand Down
Loading

0 comments on commit 702cc63

Please sign in to comment.