From 42e740238eaddc54ef0233d487a605558de02be6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Fri, 17 Nov 2023 13:31:32 +0100
Subject: [PATCH] Complete the mapping of all parameters

Extend _REQUEST_PARAMS so that it lists every root Zyte API request
parameter, and give each entry a "changes_fingerprint" flag.

The request fingerprinter now drops every parameter whose
"changes_fingerprint" flag is False, replacing its own hard-coded list
of skipped keys, and folds httpRequestText into httpRequestBody so that
the same body sent through either parameter yields the same fingerprint.

_unset_unneeded_api_params now skips parameters that are not present in
api_params before comparing their value against the default.

---
 scrapy_zyte_api/_params.py                | 191 +++++++++++++++++++++-
 scrapy_zyte_api/_request_fingerprinter.py |  30 ++--
 tests/test_api_requests.py                |  75 +++++----
 3 files changed, 247 insertions(+), 49 deletions(-)

diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py
index 93ff57df..4c585f00 100644
--- a/scrapy_zyte_api/_params.py
+++ b/scrapy_zyte_api/_params.py
@@ -17,58 +17,237 @@
 _NoDefault = object()
 
 
-# Map of all known Zyte API request params and how they need to be handled.
-# Sorted by appearance in https://docs.zyte.com/zyte-api/usage/reference.html.
+# Map of all known root Zyte API request params and how they need to be
+# handled. Sorted by appearance in
+# https://docs.zyte.com/zyte-api/usage/reference.html.
 _REQUEST_PARAMS = {
     "url": {
         "default": _NoDefault,
         "is_extract_type": False,
         "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "requestHeaders": {
+        "default": {},
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": False,
+    },
+    "httpRequestMethod": {
+        "default": "GET",
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "httpRequestBody": {
+        "default": "",
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "httpRequestText": {
+        "default": "",
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "customHttpRequestHeaders": {
+        "default": [],
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": False,
+    },
+    "httpResponseBody": {
+        "default": False,
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "httpResponseHeaders": {
+        "default": False,
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
     },
     "browserHtml": {
         "default": False,
         "is_extract_type": False,
         "requires_browser_rendering": True,
+        "changes_fingerprint": True,
     },
     "screenshot": {
         "default": False,
         "is_extract_type": False,
         "requires_browser_rendering": True,
+        "changes_fingerprint": True,
+    },
+    "screenshotOptions": {
+        "default": {},
+        "is_extract_type": False,
+        "requires_browser_rendering": False,  # Not on its own.
+        "changes_fingerprint": True,
     },
     "article": {
         "default": False,
         "is_extract_type": True,
         "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "articleOptions": {
+        "default": {},
+        "is_extract_type": False,  # Not on its own.
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
     },
     "articleList": {
         "default": False,
         "is_extract_type": True,
         "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "articleListOptions": {
+        "default": {},
+        "is_extract_type": False,  # Not on its own.
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
     },
     "articleNavigation": {
         "default": False,
         "is_extract_type": True,
         "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "articleNavigationOptions": {
+        "default": {},
+        "is_extract_type": False,  # Not on its own.
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
     },
     "jobPosting": {
         "default": False,
         "is_extract_type": True,
         "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "jobPostingOptions": {
+        "default": {},
+        "is_extract_type": False,  # Not on its own.
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
     },
     "product": {
         "default": False,
         "is_extract_type": True,
         "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "productOptions": {
+        "default": {},
+        "is_extract_type": False,  # Not on its own.
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
     },
     "productList": {
         "default": False,
         "is_extract_type": True,
         "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "productListOptions": {
+        "default": {},
+        "is_extract_type": False,  # Not on its own.
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
     },
     "productNavigation": {
         "default": False,
         "is_extract_type": True,
         "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "productNavigationOptions": {
+        "default": {},
+        "is_extract_type": False,  # Not on its own.
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "geolocation": {
+        "default": None,
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "javascript": {
+        "default": None,
+        "is_extract_type": False,
+        "requires_browser_rendering": False,  # Not on its own.
+        "changes_fingerprint": True,
+    },
+    "actions": {
+        "default": [],
+        "is_extract_type": False,
+        "requires_browser_rendering": False,  # Not on its own.
+        "changes_fingerprint": True,
+    },
+    "jobId": {
+        "default": None,
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": False,
+    },
+    "echoData": {
+        "default": None,
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": False,
+    },
+    "viewport": {
+        "default": {},
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,
+    },
+    "sessionContext": {
+        "default": [],
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": False,  # Treated like headers.
+    },
+    "sessionContextParameters": {
+        "default": {},
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": False,  # Treated like sessionContext.
+    },
+    "device": {
+        "default": "auto",
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": True,  # Treated like viewport.
+    },
+    "cookieManagement": {
+        "default": "auto",
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": False,  # Treated like headers.
+    },
+    "requestCookies": {
+        "default": [],
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": False,  # Treated like headers.
+    },
+    "responseCookies": {
+        "default": False,
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": False,
+    },
+    "experimental": {
+        "default": {},
+        "is_extract_type": False,
+        "requires_browser_rendering": False,
+        "changes_fingerprint": False,
     },
 }
 
@@ -392,6 +571,9 @@ def _set_http_request_body_from_request(
         api_params["httpRequestBody"] = base64_body
 
 
+_Undefined = object()
+
+
 def _unset_unneeded_api_params(
     *,
     api_params: Dict[str, Any],
@@ -399,7 +581,10 @@ def _unset_unneeded_api_params(
     request: Request,
 ):
     for param, default_value in _DEFAULT_API_PARAMS.items():
-        if api_params.get(param) != default_value:
+        value = api_params.get(param, _Undefined)
+        if value is _Undefined:
+            continue
+        if value != default_value:
             continue
         if param not in default_params or default_params.get(param) == default_value:
             logger.warning(
diff --git a/scrapy_zyte_api/_request_fingerprinter.py b/scrapy_zyte_api/_request_fingerprinter.py
index c4a2d7c6..0fc138a2 100644
--- a/scrapy_zyte_api/_request_fingerprinter.py
+++ b/scrapy_zyte_api/_request_fingerprinter.py
@@ -8,6 +8,7 @@
 else:
     import hashlib
     import json
+    from base64 import b64encode
     from weakref import WeakKeyDictionary
 
     from scrapy import Request
@@ -15,7 +16,7 @@
     from scrapy.utils.misc import create_instance, load_object
     from w3lib.url import canonicalize_url
 
-    from ._params import _ParamParser, _uses_browser
+    from ._params import _REQUEST_PARAMS, _ParamParser, _uses_browser
 
     class ScrapyZyteAPIRequestFingerprinter:
         @classmethod
@@ -36,25 +37,28 @@ def __init__(self, crawler):
             )
             self._cache: "WeakKeyDictionary[Request, bytes]" = WeakKeyDictionary()
             self._param_parser = _ParamParser(crawler, cookies_enabled=False)
-            self._skip_keys = (
-                "customHttpRequestHeaders",
-                "echoData",
-                "jobId",
-                "requestHeaders",
-                "experimental",
+
+        def _normalize_params(self, api_params):
+            api_params["url"] = canonicalize_url(
+                api_params["url"],
+                keep_fragments=_uses_browser(api_params),
             )
 
+            if "httpRequestText" in api_params:
+                api_params["httpRequestBody"] = b64encode(
+                    api_params.pop("httpRequestText").encode()
+                ).decode()
+
+            for key, value in _REQUEST_PARAMS.items():
+                if value["changes_fingerprint"] is False:
+                    api_params.pop(key, None)
+
         def fingerprint(self, request):
             if request in self._cache:
                 return self._cache[request]
             api_params = self._param_parser.parse(request)
             if api_params is not None:
-                api_params["url"] = canonicalize_url(
-                    api_params["url"],
-                    keep_fragments=_uses_browser(api_params),
-                )
-                for key in self._skip_keys:
-                    api_params.pop(key, None)
+                self._normalize_params(api_params)
                 fingerprint_json = json.dumps(api_params, sort_keys=True)
                 self._cache[request] = hashlib.sha1(fingerprint_json.encode()).digest()
                 return self._cache[request]
diff --git a/tests/test_api_requests.py b/tests/test_api_requests.py
index 4d861f36..5bf8aba8 100644
--- a/tests/test_api_requests.py
+++ b/tests/test_api_requests.py
@@ -938,43 +938,52 @@ def test_automap_header_output(meta, expected, warnings, caplog):
         # If httpRequestMethod is also specified in meta with the same value
         # as Request.method, a warning is logged asking to use only
         # Request.method.
-        *(
-            (
-                request_method,
-                {"httpRequestMethod": meta_method},
-                {
-                    "httpResponseBody": True,
-                    "httpResponseHeaders": True,
-                    "httpRequestMethod": meta_method,
-                },
-                ["Use Request.method"],
-            )
-            for request_method, meta_method in (
-                ("GET", "GET"),
-                ("POST", "POST"),
-            )
+        (
+            None,
+            {"httpRequestMethod": "GET"},
+            {
+                "httpResponseBody": True,
+                "httpResponseHeaders": True,
+            },
+            ["Use Request.method"],
+        ),
+        (
+            "POST",
+            {"httpRequestMethod": "POST"},
+            {
+                "httpResponseBody": True,
+                "httpResponseHeaders": True,
+                "httpRequestMethod": "POST",
+            },
+            ["Use Request.method"],
         ),
         # If httpRequestMethod is also specified in meta with a different value
         # from Request.method, a warning is logged asking to use Request.meta,
         # and the meta value takes precedence.
-        *(
-            (
-                request_method,
-                {"httpRequestMethod": meta_method},
-                {
-                    "httpResponseBody": True,
-                    "httpResponseHeaders": True,
-                    "httpRequestMethod": meta_method,
-                },
-                [
-                    "Use Request.method",
-                    "does not match the Zyte API httpRequestMethod",
-                ],
-            )
-            for request_method, meta_method in (
-                ("GET", "POST"),
-                ("PUT", "GET"),
-            )
+        (
+            "POST",
+            {"httpRequestMethod": "GET"},
+            {
+                "httpResponseBody": True,
+                "httpResponseHeaders": True,
+            },
+            [
+                "Use Request.method",
+                "does not match the Zyte API httpRequestMethod",
+            ],
+        ),
+        (
+            "POST",
+            {"httpRequestMethod": "PUT"},
+            {
+                "httpResponseBody": True,
+                "httpResponseHeaders": True,
+                "httpRequestMethod": "PUT",
+            },
+            [
+                "Use Request.method",
+                "does not match the Zyte API httpRequestMethod",
+            ],
         ),
         # If httpResponseBody is not True, implicitly or explicitly,
         # Request.method is still mapped for anything other than GET.