diff --git a/docs/reference/index.rst b/docs/reference/index.rst
index 4fbf2c7..c077458 100644
--- a/docs/reference/index.rst
+++ b/docs/reference/index.rst
@@ -10,6 +10,9 @@ Base classes
 
 .. autoclass:: zyte_spider_templates.spiders.base.BaseSpider
 
+.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom
+   :noindex:
+
 .. autoenum:: zyte_spider_templates.spiders.base.Geolocation
    :noindex:
 
@@ -23,9 +26,6 @@ E-commerce
 .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
    :noindex:
 
-.. autoclass:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
-   :noindex:
-
 .. autoclass:: zyte_spider_templates.spiders.ecommerce.EcommerceSpider
    :noindex:
 
diff --git a/docs/templates/e-commerce.rst b/docs/templates/e-commerce.rst
index e2a8684..e13d79c 100644
--- a/docs/templates/e-commerce.rst
+++ b/docs/templates/e-commerce.rst
@@ -19,6 +19,6 @@ Parameters
 
 .. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
 
-.. autoenum:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
+.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom
 
 .. autoenum:: zyte_spider_templates.spiders.base.Geolocation
diff --git a/setup.py b/setup.py
index ee39b27..13de375 100644
--- a/setup.py
+++ b/setup.py
@@ -14,9 +14,9 @@
     install_requires=[
         "pydantic>=2",
         "scrapy>=2.11.0",
-        "scrapy-poet>=0.20.1",
+        "scrapy-poet>=0.21.0",
         "scrapy-spider-metadata>=0.1.2",
-        "scrapy-zyte-api[provider]>=0.15.0",
+        "scrapy-zyte-api[provider]>=0.16.0",
         "zyte-common-items>=0.13.0",
     ],
     classifiers=[
diff --git a/tests/pages/test_product_navigation_heuristics.py b/tests/pages/test_product_navigation_heuristics.py
index a0db58b..9fd4250 100644
--- a/tests/pages/test_product_navigation_heuristics.py
+++ b/tests/pages/test_product_navigation_heuristics.py
@@ -1,6 +1,6 @@
 import pytest
 from pytest_twisted import ensureDeferred
-from web_poet import HttpResponse, PageParams, RequestUrl
+from web_poet import AnyResponse, HttpResponse, PageParams, RequestUrl
 from zyte_common_items import ProbabilityRequest, ProductNavigation
 
 from zyte_spider_templates.pages.product_navigation_heuristics import (
@@ -38,7 +38,7 @@ async def test_unknown_product_page():
     """
 
-    response = HttpResponse("https://example.com", body)
+    response = AnyResponse(HttpResponse("https://example.com", body))
     navigation = ProductNavigation.from_dict(
         {
             "url": "https://example.com",
@@ -118,7 +118,7 @@ async def test_crawl_nofollow_links():
     """
 
     url = "https://example.com"
-    response = HttpResponse(url, body)
+    response = AnyResponse(HttpResponse(url, body))
     request_url = RequestUrl(response.url)
     navigation = ProductNavigation(url=url)
 
diff --git a/tox.ini b/tox.ini
index 2d0b12c..e788931 100644
--- a/tox.ini
+++ b/tox.ini
@@ -22,9 +22,9 @@ deps =
     {[testenv]deps}
     pydantic==2
     scrapy==2.11.0
-    scrapy-poet==0.20.1
+    scrapy-poet==0.21.0
     scrapy-spider-metadata==0.1.2
-    scrapy-zyte-api[provider]==0.15.0
+    scrapy-zyte-api[provider]==0.16.0
     zyte-common-items==0.13.0
 
 [testenv:mypy]
@@ -51,4 +51,4 @@ changedir = docs
 deps =
     -rdocs/requirements.txt
 commands =
-    sphinx-build -W -b html . {envtmpdir}/html
\ No newline at end of file
+    sphinx-build -W -b html . {envtmpdir}/html
diff --git a/zyte_spider_templates/pages/product_navigation_heuristics.py b/zyte_spider_templates/pages/product_navigation_heuristics.py
index 5d1d38a..bd012ff 100644
--- a/zyte_spider_templates/pages/product_navigation_heuristics.py
+++ b/zyte_spider_templates/pages/product_navigation_heuristics.py
@@ -3,7 +3,7 @@
 import attrs
 from scrapy.http import TextResponse
 from scrapy.linkextractors import LinkExtractor
-from web_poet import HttpResponse, PageParams, field, handle_urls
+from web_poet import AnyResponse, PageParams, field, handle_urls
 from zyte_common_items import AutoProductNavigationPage, ProbabilityRequest
 
 from zyte_spider_templates.heuristics import might_be_category
@@ -12,12 +12,7 @@
 @handle_urls("")
 @attrs.define
 class HeuristicsProductNavigationPage(AutoProductNavigationPage):
-    # TODO: swap with BrowserResponse after evaluating it.
-    # Also after when the following issue has been fixed:
-    # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/91#issuecomment-1744305554
-    # NOTE: Even with BrowserResponse, it would still send separate
-    # requests for it and productNavigation.
-    response: HttpResponse
+    response: AnyResponse
     page_params: PageParams
 
     @field
@@ -55,7 +50,9 @@ def _probably_category_links(self) -> List[ProbabilityRequest]:
         ignore_urls = set(self._urls_for_category())
 
         links = []
-        response = TextResponse(url=str(self.response.url), body=self.response.body)
+        response = TextResponse(
+            url=str(self.response.url), body=self.response.text.encode()
+        )
         for link in link_extractor.extract_links(response):
             if link.url in ignore_urls:
                 continue
diff --git a/zyte_spider_templates/spiders/base.py b/zyte_spider_templates/spiders/base.py
index e64faa0..fc322f7 100644
--- a/zyte_spider_templates/spiders/base.py
+++ b/zyte_spider_templates/spiders/base.py
@@ -1,3 +1,4 @@
+from enum import Enum
 from importlib.metadata import version
 from typing import Any, Dict, Optional
 
@@ -9,11 +10,22 @@
     GEOLOCATION_OPTIONS_WITH_CODE,
     Geolocation,
 )
+from zyte_spider_templates.documentation import document_enum
 
 # Higher priority than command-line-defined settings (40).
 ARG_SETTING_PRIORITY: int = 50
 
 
+@document_enum
+class ExtractFrom(str, Enum):
+    httpResponseBody: str = "httpResponseBody"
+    """Use HTTP responses. Cost-efficient and fast extraction method, which
+    works well on many websites."""
+
+    browserHtml: str = "browserHtml"
+    """Use browser rendering. Often provides the best quality."""
+
+
 class BaseSpiderParams(BaseModel):
     url: str = Field(
         title="URL",
@@ -48,6 +60,26 @@ class BaseSpiderParams(BaseModel):
             "widget": "request-limit",
         },
     )
+    extract_from: Optional[ExtractFrom] = Field(
+        title="Extraction source",
+        description=(
+            "Whether to perform extraction using a browser request "
+            "(browserHtml) or an HTTP request (httpResponseBody)."
+        ),
+        default=None,
+        json_schema_extra={
+            "enumMeta": {
+                ExtractFrom.browserHtml: {
+                    "title": "browserHtml",
+                    "description": "Use browser rendering. Often provides the best quality.",
+                },
+                ExtractFrom.httpResponseBody: {
+                    "title": "httpResponseBody",
+                    "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
+                },
+            },
+        },
+    )
 
 
 class BaseSpider(scrapy.Spider):
diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py
index 4de10ea..b05b8bf 100644
--- a/zyte_spider_templates/spiders/ecommerce.py
+++ b/zyte_spider_templates/spiders/ecommerce.py
@@ -33,16 +33,6 @@ class EcommerceCrawlStrategy(str, Enum):
     ML-extraction."""
 
 
-@document_enum
-class ExtractFrom(str, Enum):
-    httpResponseBody: str = "httpResponseBody"
-    """Use HTTP responses. Cost-efficient and fast extraction method, which
-    works well on many websites."""
-
-    browserHtml: str = "browserHtml"
-    """Use browser rendering. Often provides the best quality."""
-
-
 class EcommerceSpiderParams(BaseSpiderParams):
     crawl_strategy: EcommerceCrawlStrategy = Field(
         title="Crawl strategy",
@@ -68,26 +58,6 @@ class EcommerceSpiderParams(BaseSpiderParams):
             },
         },
     )
-    extract_from: Optional[ExtractFrom] = Field(
-        title="Extraction source",
-        description=(
-            "Whether to perform extraction using a browser request "
-            "(browserHtml) or an HTTP request (httpResponseBody)."
-        ),
-        default=None,
-        json_schema_extra={
-            "enumMeta": {
-                ExtractFrom.browserHtml: {
-                    "title": "browserHtml",
-                    "description": "Use browser rendering. Often provides the best quality.",
-                },
-                ExtractFrom.httpResponseBody: {
-                    "title": "httpResponseBody",
-                    "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
-                },
-            },
-        },
-    )
 
 
 class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider):
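
A minimal sketch of how the relocated extract_from parameter can be passed to the e-commerce template, assuming a project already configured for Zyte API (the start URL below is a placeholder; EcommerceSpider and the accepted values come from the diff above, and CrawlerProcess is standard Scrapy API):

    from scrapy.crawler import CrawlerProcess

    from zyte_spider_templates.spiders.ecommerce import EcommerceSpider

    # Spider arguments are validated against EcommerceSpiderParams, which now
    # inherits extract_from (and the ExtractFrom enum) from spiders/base.py.
    process = CrawlerProcess()
    process.crawl(
        EcommerceSpider,
        url="https://example.com",   # placeholder start URL
        extract_from="browserHtml",  # or "httpResponseBody"; omit to keep the default (None)
    )
    process.start()

Because extract_from now lives on BaseSpiderParams, any future template inheriting from BaseSpider exposes the same parameter without redefining it.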