Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support AnyResponse #161

Merged
merged 25 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
809e179
support HttpOrBrowserResponse
BurnzZ Jan 16, 2024
2f5c69d
use new AnyResponse instead of HttpOrBrowserResponse
BurnzZ Jan 16, 2024
b4a79b0
use zapi response contents to determine which response to build
BurnzZ Jan 16, 2024
5c83d22
add provider tests for AnyResponse
BurnzZ Jan 17, 2024
202fa1b
fix mypy
BurnzZ Jan 17, 2024
5f1d104
add more test cases
BurnzZ Jan 17, 2024
3cb2290
use new weak_ref in scrapy_poet's Injector to handle more cases
BurnzZ Jan 18, 2024
85f355d
BrowserResponse takes precedence in AnyResponse
BurnzZ Jan 19, 2024
297eb57
add more test cases
BurnzZ Jan 19, 2024
1e19ef3
docs for AnyResponse fulfillment
BurnzZ Jan 19, 2024
137d146
AnyResponse would request HttpResponse if BrowserResponse/HttpRespons…
BurnzZ Jan 19, 2024
9322622
improve tests
BurnzZ Jan 22, 2024
ffa345f
small comments and improvements
BurnzZ Jan 24, 2024
7043c55
add test for item return
BurnzZ Jan 26, 2024
3fec1b8
handle empty itemOptions
BurnzZ Jan 31, 2024
b341976
avoid using ParamParser
BurnzZ Jan 31, 2024
60f5c2b
Merge branch 'main' of ssh://github.com/scrapy-plugins/scrapy-zyte-ap…
BurnzZ Jan 31, 2024
2c5e506
use web-poet>=0.16.0
BurnzZ Jan 31, 2024
6c1e043
temporarily use scrapy-poet master branch
BurnzZ Jan 31, 2024
571ee63
Merge branch 'main' of ssh://github.com/scrapy-plugins/scrapy-zyte-ap…
BurnzZ Feb 2, 2024
a51f961
use browserHtml if no extraction source is provided with item types
BurnzZ Feb 5, 2024
9da17b4
remove cache updates since Injector is already doing it
BurnzZ Feb 7, 2024
d389893
Update tests on instance identity
BurnzZ Feb 8, 2024
df32f14
remove duplicate test cases for AnyResponse
BurnzZ Feb 8, 2024
382dced
use scrapy-poet 0.21.0
BurnzZ Feb 8, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 75 additions & 17 deletions scrapy_zyte_api/providers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
from typing import Any, Callable, Dict, List, Sequence, Set
from weakref import WeakKeyDictionary

from andi.typeutils import is_typing_annotated, strip_annotated
from scrapy import Request
from scrapy.crawler import Crawler
from scrapy.utils.defer import maybe_deferred_to_future
from scrapy_poet import AnnotatedResult, PageObjectInputProvider
from web_poet import BrowserHtml, BrowserResponse
from web_poet import (

Check warning on line 8 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L8

Added line #L8 was not covered by tests
AnyResponse,
BrowserHtml,
BrowserResponse,
HttpResponse,
HttpResponseHeaders,
)
from zyte_common_items import (
Article,
ArticleList,
Expand All @@ -19,6 +24,7 @@
)

from scrapy_zyte_api._annotations import ExtractFrom
from scrapy_zyte_api._params import _ParamParser

Check warning on line 27 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L27

Added line #L27 was not covered by tests
from scrapy_zyte_api.responses import ZyteAPITextResponse

try:
Expand All @@ -40,33 +46,39 @@
Article,
ArticleList,
ArticleNavigation,
AnyResponse,
JobPosting,
}

def __init__(self, injector):
super().__init__(injector)
self._cached_instances: WeakKeyDictionary[Request, Dict] = WeakKeyDictionary()

def is_provided(self, type_: Callable) -> bool:
return super().is_provided(strip_annotated(type_))

def update_cache(self, request: Request, mapping: Dict[Any, Any]) -> None:
if request not in self._cached_instances:
self._cached_instances[request] = {}
self._cached_instances[request].update(mapping)
if request not in self.injector.weak_cache:
self.injector.weak_cache[request] = {}
self.injector.weak_cache[request].update(mapping)

Check warning on line 59 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L57-L59

Added lines #L57 - L59 were not covered by tests
BurnzZ marked this conversation as resolved.
Show resolved Hide resolved

async def __call__( # noqa: C901
self, to_provide: Set[Callable], request: Request, crawler: Crawler
) -> Sequence[Any]:
"""Makes a Zyte API request to provide BrowserResponse and/or item dependencies."""
# TODO what if ``response`` is already from Zyte API and contains something we need
results: List[Any] = []

for cls in list(to_provide):
item = self._cached_instances.get(request, {}).get(cls)
item = self.injector.weak_cache.get(request, {}).get(cls)

Check warning on line 68 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L68

Added line #L68 was not covered by tests
if item:
results.append(item)
to_provide.remove(cls)
elif cls == AnyResponse:
http_response = self.injector.weak_cache.get(request, {}).get(

Check warning on line 73 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L72-L73

Added lines #L72 - L73 were not covered by tests
HttpResponse
)
BurnzZ marked this conversation as resolved.
Show resolved Hide resolved
if http_response:
any_response = AnyResponse(response=http_response)
results.append(any_response)
self.update_cache(request, {AnyResponse: any_response})
to_provide.remove(cls)

Check warning on line 80 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L76-L80

Added lines #L76 - L80 were not covered by tests

if not to_provide:
return results

Expand All @@ -82,8 +94,6 @@
}

zyte_api_meta = crawler.settings.getdict("ZYTE_API_PROVIDER_PARAMS")
if html_requested:
zyte_api_meta["browserHtml"] = True

to_provide_stripped: Set[type] = set()
extract_from_seen: Dict[str, str] = {}
Expand Down Expand Up @@ -112,10 +122,25 @@
options["extractFrom"] = extract_from.value
break

extract_from = None # type: ignore[assignment]

Check warning on line 125 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L125

Added line #L125 was not covered by tests
for item_type, kw in item_keywords.items():
options_name = f"{kw}Options"
if item_type not in to_provide_stripped and options_name in zyte_api_meta:
del zyte_api_meta[options_name]
BurnzZ marked this conversation as resolved.
Show resolved Hide resolved
elif options_name in zyte_api_meta:
extract_from = zyte_api_meta[options_name]["extractFrom"]

Check warning on line 131 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L130-L131

Added lines #L130 - L131 were not covered by tests

if AnyResponse in to_provide:
if extract_from == "browserHtml":
html_requested = True
BurnzZ marked this conversation as resolved.
Show resolved Hide resolved
elif extract_from == "httpResponseBody":
param_parser = _ParamParser(crawler)
http_request_params = param_parser.parse(request)
del http_request_params["url"]
zyte_api_meta.update(http_request_params)

Check warning on line 140 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L133-L140

Added lines #L133 - L140 were not covered by tests

if html_requested:
zyte_api_meta["browserHtml"] = True

Check warning on line 143 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L142-L143

Added lines #L142 - L143 were not covered by tests

api_request = Request(
url=request.url,
Expand All @@ -137,14 +162,47 @@
if BrowserHtml in to_provide:
results.append(html)
self.update_cache(request, {BrowserHtml: html})

browser_response = None

Check warning on line 166 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L166

Added line #L166 was not covered by tests
if BrowserResponse in to_provide:
response = BrowserResponse(
browser_response = BrowserResponse(

Check warning on line 168 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L168

Added line #L168 was not covered by tests
url=api_response.url,
status=api_response.status,
html=html,
)
results.append(response)
self.update_cache(request, {BrowserResponse: response})
results.append(browser_response)
self.update_cache(request, {BrowserResponse: browser_response})

Check warning on line 174 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L173-L174

Added lines #L173 - L174 were not covered by tests

if AnyResponse in to_provide:
any_response = None # type: ignore[assignment]

Check warning on line 177 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L176-L177

Added lines #L176 - L177 were not covered by tests

if "browserHtml" in api_response.raw_api_response:
any_response = AnyResponse(

Check warning on line 180 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L179-L180

Added lines #L179 - L180 were not covered by tests
response=browser_response
or BrowserResponse(
url=api_response.url,
status=api_response.status,
html=html,
)
)
elif (

Check warning on line 188 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L188

Added line #L188 was not covered by tests
"httpResponseBody" in api_response.raw_api_response
and "httpResponseHeaders" in api_response.raw_api_response
):
any_response = AnyResponse(

Check warning on line 192 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L192

Added line #L192 was not covered by tests
response=HttpResponse(
url=api_response.url,
body=api_response.body,
status=api_response.status,
headers=HttpResponseHeaders.from_bytes_dict(
api_response.headers
),
)
)

if any_response:
results.append(any_response)
self.update_cache(request, {AnyResponse: any_response})

Check warning on line 205 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L203-L205

Added lines #L203 - L205 were not covered by tests

for cls in to_provide:
cls_stripped = strip_annotated(cls)
Expand All @@ -153,7 +211,7 @@
if not kw:
continue
assert issubclass(cls_stripped, Item)
item = cls_stripped.from_dict(api_response.raw_api_response[kw])
item = cls_stripped.from_dict(api_response.raw_api_response[kw]) # type: ignore[attr-defined]

Check warning on line 214 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L214

Added line #L214 was not covered by tests
if is_typing_annotated(cls):
item = AnnotatedResult(item, cls.__metadata__) # type: ignore[attr-defined]
results.append(item)
Expand Down
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@ def get_version():
# Sync with [testenv:pinned-provider] @ tox.ini
"provider": [
"andi>=0.6.0",
"scrapy-poet>=0.19.0",
"web-poet>=0.15.1",
# "scrapy-poet>=0.19.0",
"scrapy-poet @ git+https://[email protected]/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet",
# "web-poet>=0.15.1",
"web-poet @ git+https://[email protected]/scrapinghub/web-poet@response#egg=web-poet",
"zyte-common-items>=0.8.0",
]
},
Expand Down
Loading