scrapy-plugins · BurnzZ · Feb 8, 2024 · Jan 16, 2024 · Jan 16, 2024 · Jan 16, 2024
diff --git a/docs/reference/inputs.rst b/docs/reference/inputs.rst
@@ -13,6 +13,13 @@ Inputs
 
 -   :class:`web_poet.BrowserResponse`
 
+-   :class:`web_poet.AnyResponse`
+
+    This reuses :class:`web_poet.BrowserResponse` *(takes priority)* or
+    :class:`web_poet.HttpResponse` if either is already available. If
+    neither is available, it uses a :class:`web_poet.HttpResponse` requested
+    from Zyte API.
+
 -   :class:`zyte_common_items.Article`
 
 -   :class:`zyte_common_items.ArticleList`

diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py
@@ -1,12 +1,17 @@
 from typing import Any, Callable, Dict, List, Sequence, Set
-from weakref import WeakKeyDictionary
 
 from andi.typeutils import is_typing_annotated, strip_annotated
 from scrapy import Request
 from scrapy.crawler import Crawler
 from scrapy.utils.defer import maybe_deferred_to_future
 from scrapy_poet import AnnotatedResult, PageObjectInputProvider
-from web_poet import BrowserHtml, BrowserResponse
+from web_poet import (
+    AnyResponse,
+    BrowserHtml,
+    BrowserResponse,
+    HttpResponse,
+    HttpResponseHeaders,
+)
 from zyte_common_items import (
     Article,
     ArticleList,
@@ -19,6 +24,7 @@
 )
 
 from scrapy_zyte_api._annotations import ExtractFrom
+from scrapy_zyte_api._params import _ParamParser
 from scrapy_zyte_api.responses import ZyteAPITextResponse
 
 try:
@@ -40,33 +46,42 @@
         Article,
         ArticleList,
         ArticleNavigation,
+        AnyResponse,
         JobPosting,
     }
 
-    def __init__(self, injector):
-        super().__init__(injector)
-        self._cached_instances: WeakKeyDictionary[Request, Dict] = WeakKeyDictionary()
-
     def is_provided(self, type_: Callable) -> bool:
         return super().is_provided(strip_annotated(type_))
 
     def update_cache(self, request: Request, mapping: Dict[Any, Any]) -> None:
-        if request not in self._cached_instances:
-            self._cached_instances[request] = {}
-        self._cached_instances[request].update(mapping)
+        if request not in self.injector.weak_cache:
+            self.injector.weak_cache[request] = {}
+        self.injector.weak_cache[request].update(mapping)
 
     async def __call__(  # noqa: C901
         self, to_provide: Set[Callable], request: Request, crawler: Crawler
     ) -> Sequence[Any]:
         """Makes a Zyte API request to provide BrowserResponse and/or item dependencies."""
-        # TODO what if ``response`` is already from Zyte API and contains something we need
         results: List[Any] = []
 
+        http_response = None
         for cls in list(to_provide):
-            item = self._cached_instances.get(request, {}).get(cls)
+            item = self.injector.weak_cache.get(request, {}).get(cls)
             if item:
                 results.append(item)
                 to_provide.remove(cls)
+
+            # BrowserResponse takes precedence over HttpResponse
+            elif cls == AnyResponse and BrowserResponse not in to_provide:
+                http_response = self.injector.weak_cache.get(request, {}).get(
+                    HttpResponse
+                )
+                if http_response:
+                    any_response = AnyResponse(response=http_response)
+                    results.append(any_response)
+                    self.update_cache(request, {AnyResponse: any_response})
+                    to_provide.remove(cls)
+
         if not to_provide:
             return results
 
@@ -82,8 +97,6 @@
         }
 
         zyte_api_meta = crawler.settings.getdict("ZYTE_API_PROVIDER_PARAMS")
-        if html_requested:
-            zyte_api_meta["browserHtml"] = True
 
         to_provide_stripped: Set[type] = set()
         extract_from_seen: Dict[str, str] = {}
@@ -112,10 +125,36 @@
                     options["extractFrom"] = extract_from.value
                     break
 
+        http_response_needed = (
+            AnyResponse in to_provide
+            and BrowserResponse not in to_provide
+            and BrowserHtml not in to_provide
+            and not http_response
+        )
+
+        extract_from = None  # type: ignore[assignment]
         for item_type, kw in item_keywords.items():
             options_name = f"{kw}Options"
             if item_type not in to_provide_stripped and options_name in zyte_api_meta:
                 del zyte_api_meta[options_name]
+            elif options_name in zyte_api_meta:
+                extract_from = zyte_api_meta[options_name].get("extractFrom")
+            elif item_type in to_provide_stripped and http_response_needed:
+                zyte_api_meta[options_name] = {"extractFrom": "httpResponseBody"}
+
+        if AnyResponse in to_provide:
+            if extract_from == "browserHtml":
+                html_requested = True
+            elif extract_from == "httpResponseBody" or http_response_needed:
+                param_parser = _ParamParser(crawler)
+                param_parser._transparent_mode = True
+                http_request_params = param_parser.parse(request)
+                del http_request_params["url"]
+                zyte_api_meta.update(http_request_params)
+
+        # TODO: Map out RequestHeaders similar to httpResponseBody
+        if html_requested:
+            zyte_api_meta["browserHtml"] = True
 
         api_request = Request(
             url=request.url,
@@ -137,14 +176,47 @@
         if BrowserHtml in to_provide:
             results.append(html)
             self.update_cache(request, {BrowserHtml: html})
+
+        browser_response = None
         if BrowserResponse in to_provide:
-            response = BrowserResponse(
+            browser_response = BrowserResponse(
                 url=api_response.url,
                 status=api_response.status,
                 html=html,
             )
-            results.append(response)
-            self.update_cache(request, {BrowserResponse: response})
+            results.append(browser_response)
+            self.update_cache(request, {BrowserResponse: browser_response})
+
+        if AnyResponse in to_provide:
+            any_response = None  # type: ignore[assignment]
+
+            if "browserHtml" in api_response.raw_api_response:
+                any_response = AnyResponse(
+                    response=browser_response
+                    or BrowserResponse(
+                        url=api_response.url,
+                        status=api_response.status,
+                        html=html,
+                    )
+                )
+            elif (
+                "httpResponseBody" in api_response.raw_api_response
+                and "httpResponseHeaders" in api_response.raw_api_response
+            ):
+                any_response = AnyResponse(
+                    response=HttpResponse(
+                        url=api_response.url,
+                        body=api_response.body,
+                        status=api_response.status,
+                        headers=HttpResponseHeaders.from_bytes_dict(
+                            api_response.headers
+                        ),
+                    )
+                )
+
+            if any_response:
+                results.append(any_response)
+                self.update_cache(request, {AnyResponse: any_response})
 
         for cls in to_provide:
             cls_stripped = strip_annotated(cls)
@@ -153,7 +225,7 @@
             if not kw:
                 continue
             assert issubclass(cls_stripped, Item)
-            item = cls_stripped.from_dict(api_response.raw_api_response[kw])
+            item = cls_stripped.from_dict(api_response.raw_api_response[kw])  # type: ignore[attr-defined]
             if is_typing_annotated(cls):
                 item = AnnotatedResult(item, cls.__metadata__)  # type: ignore[attr-defined]
             results.append(item)

diff --git a/setup.py b/setup.py
@@ -31,8 +31,10 @@ def get_version():
         # Sync with [testenv:pinned-provider] @ tox.ini
         "provider": [
             "andi>=0.6.0",
-            "scrapy-poet>=0.19.0",
-            "web-poet>=0.15.1",
+            # "scrapy-poet>=0.19.0",
+            "scrapy-poet @ git+https://[email protected]/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet",
+            # "web-poet>=0.15.1",
+            "web-poet @ git+https://[email protected]/scrapinghub/web-poet@response#egg=web-poet",
             "zyte-common-items>=0.8.0",
         ]
     },