Skip to content

Commit

Permalink
Drop the Set-Cookie header (#132)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio authored Nov 24, 2023
1 parent b892453 commit 493a48c
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 10 deletions.
2 changes: 1 addition & 1 deletion docs/usage/manual.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,4 @@ remember to also request :http:`request:httpResponseHeaders`:
# "<html>…</html>"
To learn more about Zyte API parameters, see the upstream :ref:`usage
<zyte-api-usage>` and :ref:`API reference <zyte-api-http-api>` pages.
<zyte-api-usage>` and :ref:`API reference <zyte-api-reference>` pages.
20 changes: 12 additions & 8 deletions scrapy_zyte_api/responses.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from base64 import b64decode
from copy import copy
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple, Union

Expand All @@ -22,7 +23,7 @@ class ZyteAPIMixin:
# Zyte API already decompresses the HTTP Response Body. Scrapy's
# HttpCompressionMiddleware will error out when it attempts to
# decompress an already decompressed body based on this header.
"content-encoding"
"content-encoding",
}

def __init__(self, *args, raw_api_response: Optional[Dict] = None, **kwargs):
Expand Down Expand Up @@ -56,7 +57,7 @@ def replace(self, *args, **kwargs):
def raw_api_response(self) -> Optional[Dict]:
"""Contains the raw API response from Zyte API.
For the full list of parameters, see :ref:`zyte-api-http-api`.
For the full list of parameters, see :ref:`zyte-api-reference`.
"""
return self._raw_api_response

Expand Down Expand Up @@ -89,18 +90,21 @@ def _prepare_headers(cls, api_response: Dict[str, Any]):
input_headers: Optional[List[Dict[str, str]]] = api_response.get(
"httpResponseHeaders"
)
response_cookies: Optional[List[Dict[str, str]]] = api_response.get(
"experimental", {}
).get("responseCookies")
if input_headers:
headers_to_remove = copy(cls.REMOVE_HEADERS)
if response_cookies:
headers_to_remove.add("set-cookie")
result = {
h["name"]: [h["value"]]
for h in input_headers
if h["name"].lower() not in cls.REMOVE_HEADERS
if h["name"].lower() not in headers_to_remove
}
input_cookies: Optional[List[Dict[str, str]]] = api_response.get(
"experimental", {}
).get("responseCookies")
if input_cookies:
if response_cookies:
result["Set-Cookie"] = []
for cookie in input_cookies:
for cookie in response_cookies:
result["Set-Cookie"].append(
cls._response_cookie_to_header_value(cookie)
)
Expand Down
84 changes: 83 additions & 1 deletion tests/test_responses.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from base64 import b64encode
from collections import defaultdict
from functools import partial
from typing import Any, Dict, cast

import pytest
from scrapy import Request
Expand Down Expand Up @@ -235,23 +236,104 @@ def test_response_headers_removal(api_response, cls):
"""
additional_headers = [
{"name": "Content-Encoding", "value": "gzip"},
{"name": "Set-Cookie", "value": "a=b"},
{"name": "X-Some-Other-Value", "value": "123"},
]
raw_response = api_response()
raw_response["httpResponseHeaders"] = additional_headers

response = cls.from_api_response(raw_response)

assert response.headers == {
expected_headers = {
b"X-Some-Other-Value": [b"123"],
**OUTPUT_COOKIE_HEADERS,
}
assert response.headers == expected_headers
assert (
response.raw_api_response["httpResponseHeaders"]
== raw_response["httpResponseHeaders"]
)


INPUT_COOKIES_SIMPLE = [{"name": "c", "value": "d"}]


@pytest.mark.parametrize(
    "fields,cls,keep",
    [
        # One case per (response body kind, cookie scenario) combination.
        # The Set-Cookie header must survive only when
        # experimental.responseCookies is absent from the API response.
        (
            {
                **cast(Dict[Any, Any], body_fields),
                "httpResponseHeaders": [
                    {"name": "Content-Type", "value": "text/html"},
                    {"name": "Content-Length", "value": str(len(PAGE_CONTENT))},
                ],
                **extra_fields,  # type: ignore[dict-item]
            },
            response_cls,
            keep_header,
        )
        for body_fields, response_cls in (
            (
                {"httpResponseBody": b64encode(PAGE_CONTENT.encode("utf-8"))},
                ZyteAPIResponse,
            ),
            (
                {"browserHtml": PAGE_CONTENT},
                ZyteAPITextResponse,
            ),
        )
        for extra_fields, keep_header in (
            # Without response cookies the original Set-Cookie header stays.
            ({}, True),
            # With response cookies the original Set-Cookie header is dropped
            # and replaced by one built from the cookies.
            (
                {"experimental": {"responseCookies": INPUT_COOKIES_SIMPLE}},
                False,
            ),
        )
    ],
)
def test_response_cookie_header(fields, cls, keep):
    """Check when the Set-Cookie response header is kept or replaced.

    ``fields`` is the raw API response without ``url``, ``statusCode`` or a
    ``Set-Cookie`` entry; those are added here before building the response.
    ``keep`` tells whether the ``Set-Cookie: a=b`` header added by this test
    is expected to reach ``response.headers`` unchanged.
    """
    # Snapshot the expectation *before* the Set-Cookie header is appended
    # below, so that only the base headers are mirrored automatically.
    base_headers = fields["httpResponseHeaders"]
    expected_headers = {
        entry["name"].encode(): [entry["value"].encode()] for entry in base_headers
    }
    if keep:
        expected_headers[b"Set-Cookie"] = [b"a=b"]
    elif "experimental" in fields:
        # The header value now comes from experimental.responseCookies.
        expected_headers[b"Set-Cookie"] = [b"c=d"]

    fields.update(url="https://example.com", statusCode=200)
    fields["httpResponseHeaders"].append({"name": "Set-Cookie", "value": "a=b"})

    response = cls.from_api_response(fields)

    assert response.headers == expected_headers
    # The raw API response must keep every header, Set-Cookie included.
    raw_headers = response.raw_api_response["httpResponseHeaders"]
    assert raw_headers == fields["httpResponseHeaders"]


def test__process_response_no_body():
"""The _process_response() function should handle missing 'browserHtml' or
'httpResponseBody'.
Expand Down

0 comments on commit 493a48c

Please sign in to comment.