Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support AnyResponse #28

Merged
merged 13 commits into from
Feb 9, 2024
6 changes: 3 additions & 3 deletions docs/reference/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ Base classes

.. autoclass:: zyte_spider_templates.spiders.base.BaseSpider

.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom
:noindex:

.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
:noindex:

Expand All @@ -23,9 +26,6 @@ E-commerce
.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy
:noindex:

.. autoclass:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
:noindex:

.. autoclass:: zyte_spider_templates.spiders.ecommerce.EcommerceSpider
:noindex:

Expand Down
2 changes: 1 addition & 1 deletion docs/templates/e-commerce.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@ Parameters

.. autoenum:: zyte_spider_templates.spiders.ecommerce.EcommerceCrawlStrategy

.. autoenum:: zyte_spider_templates.spiders.ecommerce.ExtractFrom
.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom

.. autoenum:: zyte_spider_templates.spiders.base.Geolocation
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
install_requires=[
"pydantic>=2",
"scrapy>=2.11.0",
"scrapy-poet>=0.20.1",
"scrapy-poet>=0.21.0",
"scrapy-spider-metadata>=0.1.2",
"scrapy-zyte-api[provider]>=0.15.0",
"scrapy-zyte-api[provider]>=0.16.0",
"zyte-common-items>=0.13.0",
],
classifiers=[
Expand Down
6 changes: 3 additions & 3 deletions tests/pages/test_product_navigation_heuristics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest
from pytest_twisted import ensureDeferred
from web_poet import HttpResponse, PageParams, RequestUrl
from web_poet import AnyResponse, HttpResponse, PageParams, RequestUrl
from zyte_common_items import ProbabilityRequest, ProductNavigation

from zyte_spider_templates.pages.product_navigation_heuristics import (
Expand Down Expand Up @@ -38,7 +38,7 @@ async def test_unknown_product_page():
</body>
</html>
"""
response = HttpResponse("https://example.com", body)
response = AnyResponse(HttpResponse("https://example.com", body))
navigation = ProductNavigation.from_dict(
{
"url": "https://example.com",
Expand Down Expand Up @@ -118,7 +118,7 @@ async def test_crawl_nofollow_links():
</html>
"""
url = "https://example.com"
response = HttpResponse(url, body)
response = AnyResponse(HttpResponse(url, body))
request_url = RequestUrl(response.url)
navigation = ProductNavigation(url=url)

Expand Down
6 changes: 3 additions & 3 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ deps =
{[testenv]deps}
pydantic==2
scrapy==2.11.0
scrapy-poet==0.20.1
scrapy-poet==0.21.0
scrapy-spider-metadata==0.1.2
scrapy-zyte-api[provider]==0.15.0
scrapy-zyte-api[provider]==0.16.0
zyte-common-items==0.13.0

[testenv:mypy]
Expand All @@ -51,4 +51,4 @@ changedir = docs
deps =
-rdocs/requirements.txt
commands =
sphinx-build -W -b html . {envtmpdir}/html
sphinx-build -W -b html . {envtmpdir}/html
13 changes: 5 additions & 8 deletions zyte_spider_templates/pages/product_navigation_heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import attrs
from scrapy.http import TextResponse
from scrapy.linkextractors import LinkExtractor
from web_poet import HttpResponse, PageParams, field, handle_urls
from web_poet import AnyResponse, PageParams, field, handle_urls
from zyte_common_items import AutoProductNavigationPage, ProbabilityRequest

from zyte_spider_templates.heuristics import might_be_category
Expand All @@ -12,12 +12,7 @@
@handle_urls("")
@attrs.define
class HeuristicsProductNavigationPage(AutoProductNavigationPage):
# TODO: swap with BrowserResponse after evaluating it.
# Also after when the following issue has been fixed:
# https://github.com/scrapy-plugins/scrapy-zyte-api/issues/91#issuecomment-1744305554
# NOTE: Even with BrowserResponse, it would still send separate
# requests for it and productNavigation.
response: HttpResponse
response: AnyResponse
page_params: PageParams

@field
Expand Down Expand Up @@ -55,7 +50,9 @@ def _probably_category_links(self) -> List[ProbabilityRequest]:
ignore_urls = set(self._urls_for_category())

links = []
response = TextResponse(url=str(self.response.url), body=self.response.body)
response = TextResponse(
url=str(self.response.url), body=self.response.text.encode()
)
for link in link_extractor.extract_links(response):
if link.url in ignore_urls:
continue
Expand Down
32 changes: 32 additions & 0 deletions zyte_spider_templates/spiders/base.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from enum import Enum
from importlib.metadata import version
from typing import Any, Dict, Optional

Expand All @@ -9,11 +10,22 @@
GEOLOCATION_OPTIONS_WITH_CODE,
Geolocation,
)
from zyte_spider_templates.documentation import document_enum

# Higher priority than command-line-defined settings (40).
ARG_SETTING_PRIORITY: int = 50


@document_enum
class ExtractFrom(str, Enum):
    """Source used by Zyte API to extract structured data.

    Member docstrings are picked up by ``@document_enum`` and rendered by
    Sphinx ``autoenum``; they are also surfaced in the spider-parameter UI
    via the ``enumMeta`` entries on ``BaseSpiderParams.extract_from``.
    """

    httpResponseBody: str = "httpResponseBody"
    """Use HTTP responses. Cost-efficient and fast extraction method, which
    works well on many websites."""

    browserHtml: str = "browserHtml"
    """Use browser rendering. Often provides the best quality."""


class BaseSpiderParams(BaseModel):
url: str = Field(
title="URL",
Expand Down Expand Up @@ -48,6 +60,26 @@ class BaseSpiderParams(BaseModel):
"widget": "request-limit",
},
)
extract_from: Optional[ExtractFrom] = Field(
title="Extraction source",
description=(
"Whether to perform extraction using a browser request "
"(browserHtml) or an HTTP request (httpResponseBody)."
),
default=None,
json_schema_extra={
"enumMeta": {
ExtractFrom.browserHtml: {
"title": "browserHtml",
"description": "Use browser rendering. Often provides the best quality.",
},
ExtractFrom.httpResponseBody: {
"title": "httpResponseBody",
"description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
},
},
},
)


class BaseSpider(scrapy.Spider):
Expand Down
30 changes: 0 additions & 30 deletions zyte_spider_templates/spiders/ecommerce.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,6 @@ class EcommerceCrawlStrategy(str, Enum):
ML-extraction."""


@document_enum
class ExtractFrom(str, Enum):
    """Source used by Zyte API to extract structured data.

    NOTE(review): this copy is being removed by this pull request in favor
    of the identical enum in ``zyte_spider_templates.spiders.base``.
    """

    httpResponseBody: str = "httpResponseBody"
    """Use HTTP responses. Cost-efficient and fast extraction method, which
    works well on many websites."""

    browserHtml: str = "browserHtml"
    """Use browser rendering. Often provides the best quality."""


class EcommerceSpiderParams(BaseSpiderParams):
crawl_strategy: EcommerceCrawlStrategy = Field(
title="Crawl strategy",
Expand All @@ -68,26 +58,6 @@ class EcommerceSpiderParams(BaseSpiderParams):
},
},
)
extract_from: Optional[ExtractFrom] = Field(
title="Extraction source",
description=(
"Whether to perform extraction using a browser request "
"(browserHtml) or an HTTP request (httpResponseBody)."
),
default=None,
json_schema_extra={
"enumMeta": {
ExtractFrom.browserHtml: {
"title": "browserHtml",
"description": "Use browser rendering. Often provides the best quality.",
},
ExtractFrom.httpResponseBody: {
"title": "httpResponseBody",
"description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
},
},
},
)


class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider):
Expand Down
Loading