Skip to content

Commit

Permalink
Complete the mapping of all parameters
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Nov 17, 2023
1 parent 39c4b01 commit 42e7402
Show file tree
Hide file tree
Showing 3 changed files with 247 additions and 49 deletions.
191 changes: 188 additions & 3 deletions scrapy_zyte_api/_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,58 +17,237 @@

_NoDefault = object()

# Map of all known root Zyte API request params and how they need to be
# handled. Sorted by appearance in
# https://docs.zyte.com/zyte-api/usage/reference.html.
# Each key is a root-level Zyte API request parameter name. Each value is a
# dict with the same four fields:
#
# - "default": the default value recorded for the parameter. _NoDefault is a
#   sentinel object used where no default is listed (only "url").
#   NOTE(review): presumably the value the API assumes when the parameter is
#   omitted -- confirm against the API reference.
# - "is_extract_type": True only for article, articleList, articleNavigation,
#   jobPosting, product, productList and productNavigation; their matching
#   "...Options" entries are False ("Not on its own").
#   NOTE(review): presumably marks automatic-extraction outputs -- confirm.
# - "requires_browser_rendering": True only for browserHtml and screenshot.
#   Entries commented "Not on its own" are False even though they relate to
#   browser features.
# - "changes_fingerprint": when False, the request fingerprinter removes the
#   parameter from the API params before hashing them, so it has no effect
#   on the request fingerprint.
_REQUEST_PARAMS = {
"url": {
"default": _NoDefault,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"requestHeaders": {
"default": {},
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False,
},
"httpRequestMethod": {
"default": "GET",
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"httpRequestBody": {
"default": "",
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"httpRequestText": {
"default": "",
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"customHttpRequestHeaders": {
"default": [],
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False,
},
"httpResponseBody": {
"default": False,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"httpResponseHeaders": {
"default": False,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"browserHtml": {
"default": False,
"is_extract_type": False,
"requires_browser_rendering": True,
"changes_fingerprint": True,
},
"screenshot": {
"default": False,
"is_extract_type": False,
"requires_browser_rendering": True,
"changes_fingerprint": True,
},
"screenshotOptions": {
"default": {},
"is_extract_type": False,
"requires_browser_rendering": False, # Not on its own.
"changes_fingerprint": True,
},
"article": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"articleOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"articleList": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"articleListOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"articleNavigation": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"articleNavigationOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"jobPosting": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"jobPostingOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"product": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"productOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"productList": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"productListOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"productNavigation": {
"default": False,
"is_extract_type": True,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"productNavigationOptions": {
"default": {},
"is_extract_type": False, # Not on its own.
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"geolocation": {
"default": None,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"javascript": {
"default": None,
"is_extract_type": False,
"requires_browser_rendering": False, # Not on its own.
"changes_fingerprint": True,
},
"actions": {
"default": [],
"is_extract_type": False,
"requires_browser_rendering": False, # Not on its own.
"changes_fingerprint": True,
},
"jobId": {
"default": None,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False,
},
"echoData": {
"default": None,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False,
},
"viewport": {
"default": {},
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True,
},
"sessionContext": {
"default": [],
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False, # Treated like headers.
},
"sessionContextParameters": {
"default": {},
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False, # Treated like sessionContext.
},
"device": {
"default": "auto",
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": True, # Treated like viewport.
},
"cookieManagement": {
"default": "auto",
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False, # Treated like headers.
},
"requestCookies": {
"default": [],
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False, # Treated like headers.
},
"responseCookies": {
"default": False,
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False,
},
"experimental": {
"default": {},
"is_extract_type": False,
"requires_browser_rendering": False,
"changes_fingerprint": False,
},
}

Expand Down Expand Up @@ -392,14 +571,20 @@ def _set_http_request_body_from_request(
api_params["httpRequestBody"] = base64_body


_Undefined = object()


def _unset_unneeded_api_params(
*,
api_params: Dict[str, Any],
default_params: Dict[str, Any],
request: Request,
):
for param, default_value in _DEFAULT_API_PARAMS.items():
if api_params.get(param) != default_value:
value = api_params.get(param, _Undefined)
if value is _Undefined:
continue
if value != default_value:
continue
if param not in default_params or default_params.get(param) == default_value:
logger.warning(
Expand Down
30 changes: 17 additions & 13 deletions scrapy_zyte_api/_request_fingerprinter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
else:
import hashlib
import json
from base64 import b64encode
from weakref import WeakKeyDictionary

from scrapy import Request
from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS
from scrapy.utils.misc import create_instance, load_object
from w3lib.url import canonicalize_url

from ._params import _ParamParser, _uses_browser
from ._params import _REQUEST_PARAMS, _ParamParser, _uses_browser

class ScrapyZyteAPIRequestFingerprinter:
@classmethod
Expand All @@ -36,25 +37,28 @@ def __init__(self, crawler):
)
self._cache: "WeakKeyDictionary[Request, bytes]" = WeakKeyDictionary()
self._param_parser = _ParamParser(crawler, cookies_enabled=False)
self._skip_keys = (
"customHttpRequestHeaders",
"echoData",
"jobId",
"requestHeaders",
"experimental",

def _normalize_params(self, api_params):
    """Normalize *api_params* in place so equivalent requests hash the same.

    Three steps are applied, in order:

    1. The ``url`` value is canonicalized; fragments are kept only when
       ``_uses_browser(api_params)`` is true.
    2. ``httpRequestText`` is replaced by its Base64-encoded
       ``httpRequestBody`` equivalent.
    3. Every parameter whose ``changes_fingerprint`` flag in
       ``_REQUEST_PARAMS`` is ``False`` is removed.

    :param api_params: mutable dict of Zyte API parameters; it must contain
        a ``"url"`` key. The dict is modified in place.
    :return: ``None``.
    """
    api_params["url"] = canonicalize_url(
        api_params["url"],
        keep_fragments=_uses_browser(api_params),
    )

    # Express a text body as a Base64 body. The text key must be *removed*
    # (not just copied): if it stayed in the dict, a request sent with
    # httpRequestText would still serialize differently from the same
    # request sent with the equivalent httpRequestBody, and the two would
    # get different fingerprints.
    if "httpRequestText" in api_params:
        api_params["httpRequestBody"] = b64encode(
            api_params.pop("httpRequestText").encode()
        ).decode()

    # Drop every parameter that is declared not to affect the fingerprint.
    for key, value in _REQUEST_PARAMS.items():
        if value["changes_fingerprint"] is False:
            api_params.pop(key, None)

def fingerprint(self, request):
if request in self._cache:
return self._cache[request]
api_params = self._param_parser.parse(request)
if api_params is not None:
api_params["url"] = canonicalize_url(
api_params["url"],
keep_fragments=_uses_browser(api_params),
)
for key in self._skip_keys:
api_params.pop(key, None)
self._normalize_params(api_params)
fingerprint_json = json.dumps(api_params, sort_keys=True)
self._cache[request] = hashlib.sha1(fingerprint_json.encode()).digest()
return self._cache[request]
Expand Down
Loading

0 comments on commit 42e7402

Please sign in to comment.