diff --git a/docs/usage/manual.rst b/docs/usage/manual.rst index 525c530c..8293caf2 100644 --- a/docs/usage/manual.rst +++ b/docs/usage/manual.rst @@ -64,4 +64,4 @@ remember to also request :http:`request:httpResponseHeaders`: # "…" To learn more about Zyte API parameters, see the upstream :ref:`usage -` and :ref:`API reference ` pages. +` and :ref:`API reference ` pages. diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 7e223131..1a8c4cef 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -1,4 +1,5 @@ from base64 import b64decode +from copy import copy from datetime import datetime from typing import Any, Dict, List, Optional, Tuple, Union @@ -22,7 +23,7 @@ class ZyteAPIMixin: # Zyte API already decompresses the HTTP Response Body. Scrapy's # HttpCompressionMiddleware will error out when it attempts to # decompress an already decompressed body based on this header. - "content-encoding" + "content-encoding", } def __init__(self, *args, raw_api_response: Optional[Dict] = None, **kwargs): @@ -56,7 +57,7 @@ def replace(self, *args, **kwargs): def raw_api_response(self) -> Optional[Dict]: """Contains the raw API response from Zyte API. - For the full list of parameters, see :ref:`zyte-api-http-api`. + For the full list of parameters, see :ref:`zyte-api-reference`. """ return self._raw_api_response @@ -89,18 +90,21 @@ def _prepare_headers(cls, api_response: Dict[str, Any]): input_headers: Optional[List[Dict[str, str]]] = api_response.get( "httpResponseHeaders" ) + response_cookies: Optional[List[Dict[str, str]]] = api_response.get( + "experimental", {} + ).get("responseCookies") if input_headers: + headers_to_remove = copy(cls.REMOVE_HEADERS) + if response_cookies: + headers_to_remove.add("set-cookie") result = { h["name"]: [h["value"]] for h in input_headers - if h["name"].lower() not in cls.REMOVE_HEADERS + if h["name"].lower() not in headers_to_remove } - input_cookies: Optional[List[Dict[str, str]]] = api_response.get( - "experimental", {} - ).get("responseCookies") - if input_cookies: + if response_cookies: result["Set-Cookie"] = [] - for cookie in input_cookies: + for cookie in response_cookies: result["Set-Cookie"].append( cls._response_cookie_to_header_value(cookie) ) diff --git a/tests/test_responses.py b/tests/test_responses.py index 9705a6fc..e35a079f 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -1,6 +1,7 @@ from base64 import b64encode from collections import defaultdict from functools import partial +from typing import Any, Dict, cast import pytest from scrapy import Request @@ -235,6 +236,7 @@ def test_response_headers_removal(api_response, cls): """ additional_headers = [ {"name": "Content-Encoding", "value": "gzip"}, + {"name": "Set-Cookie", "value": "a=b"}, {"name": "X-Some-Other-Value", "value": "123"}, ] raw_response = api_response() @@ -242,16 +244,96 @@ def test_response_headers_removal(api_response, cls): response = cls.from_api_response(raw_response) - assert response.headers == { + expected_headers = { b"X-Some-Other-Value": [b"123"], **OUTPUT_COOKIE_HEADERS, } + assert response.headers == expected_headers assert ( response.raw_api_response["httpResponseHeaders"] == raw_response["httpResponseHeaders"] ) +INPUT_COOKIES_SIMPLE = [{"name": "c", "value": "d"}] + + +@pytest.mark.parametrize( + "fields,cls,keep", + [ + # Only keep the Set-Cookie header if experimental.responseCookies is + # not received. + *( + ( + { + **cast(Dict[Any, Any], output_fields), + "httpResponseHeaders": [ + {"name": "Content-Type", "value": "text/html"}, + {"name": "Content-Length", "value": str(len(PAGE_CONTENT))}, + ], + **cookie_fields, # type: ignore[dict-item] + }, + response_cls, + keep, + ) + for output_fields, response_cls in ( + ( + {"httpResponseBody": b64encode(PAGE_CONTENT.encode("utf-8"))}, + ZyteAPIResponse, + ), + ( + { + "browserHtml": PAGE_CONTENT, + }, + ZyteAPITextResponse, + ), + ) + for cookie_fields, keep in ( + # No response cookies, so Set-Cookie is kept. + ( + {}, + True, + ), + # Response cookies, so Set-Cookie is not kept. + ( + { + "experimental": { + "responseCookies": INPUT_COOKIES_SIMPLE, + }, + }, + False, + ), + ) + ), + ], +) +def test_response_cookie_header(fields, cls, keep): + """Test the logic to keep or not the Set-Cookie header in response + headers.""" + expected_headers = { + **{ + header["name"].encode(): [header["value"].encode()] + for header in fields["httpResponseHeaders"] + }, + } + if keep: + expected_headers[b"Set-Cookie"] = [b"a=b"] + elif "experimental" in fields: + expected_headers[b"Set-Cookie"] = [b"c=d"] + + fields["url"] = "https://example.com" + fields["statusCode"] = 200 + fields["httpResponseHeaders"].append({"name": "Set-Cookie", "value": "a=b"}) + + response = cls.from_api_response(fields) + + assert response.headers == expected_headers + assert ( + response.raw_api_response["httpResponseHeaders"] + == fields["httpResponseHeaders"] + ) + + def test__process_response_no_body(): """The _process_response() function should handle missing 'browserHtml' or 'httpResponseBody'.