Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reword param descriptions. #65

Merged
merged 3 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 28 additions & 15 deletions tests/test_ecommerce.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,39 +426,52 @@ def test_metadata():
"enumMeta": {
"automatic": {
"description": (
"Automatically use the best crawl strategy based on the given "
"URL inputs. If given a homepage URL, it would attempt to crawl "
"as many products it can discover. Otherwise, it attempt to "
"crawl the products on a given page category."
"Automatically select the best approach. A good "
"default for most use cases. Currently it uses "
"heuristics only on the homepages of websites (similar "
"to Full strategy), and follows product, category and "
"pagination links on other pages (similar to Navigation "
"strategy)."
),
"title": "Automatic",
},
"direct_item": {
"description": (
"Treat input URLs as direct links to product detail pages, and "
"extract a product from each."
"Directly extract products from the provided URLs, "
"without any crawling. To use this strategy, pass "
"individual product URLs to the spider, not the "
"website or product category URLs. Common use cases "
"are product monitoring and batch extraction."
),
"title": "Direct URLs to Product",
},
"full": {
"description": (
"Follow most links within the domain of URL in an attempt "
"to discover and extract as many products as possible."
"Follow most links on the website to discover and "
"extract as many products as possible. If an input URL "
"is a link to a particular category on a website, the "
"spider may crawl products outside this category. Try "
"this strategy if other strategies miss items."
),
"title": "Full",
},
"navigation": {
"description": (
"Follow pagination, subcategories, and product detail "
"pages. Pagination Only is a better choice if the target "
"URL does not have subcategories, or if Zyte API is "
"misidentifying some URLs as subcategories."
"Follow pagination, subcategories, and product links "
"only. If an input URL is a link to a particular "
"category on a website, the spider will try to stay "
"within this category."
),
"title": "Navigation",
},
"pagination_only": {
"description": (
"Follow pagination and product detail pages. Subcategory links are ignored."
"Follow pagination and product links only. This "
"strategy is similar to Navigation, but it doesn't "
"support subcategories. Use it when you need the "
"spider to stay within a certain category on a "
"website, but Automatic or Navigation strategies fail "
"to do so because of misclassified subcategory links."
),
"title": "Pagination Only",
},
Expand Down Expand Up @@ -517,11 +530,11 @@ def test_metadata():
),
"enumMeta": {
"browserHtml": {
"description": "Use browser rendering. Often provides the best quality.",
"description": "Use browser rendering. Better quality, but slower and more expensive.",
"title": "browserHtml",
},
"httpResponseBody": {
"description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
"description": "Use raw responses. Fast and cheap.",
"title": "httpResponseBody",
},
},
Expand Down
9 changes: 4 additions & 5 deletions zyte_spider_templates/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,10 @@
@document_enum
class ExtractFrom(str, Enum):
httpResponseBody: str = "httpResponseBody"
"""Use HTTP responses. Cost-efficient and fast extraction method, which
works well on many websites."""
"""Use raw responses. Fast and cheap."""

browserHtml: str = "browserHtml"
"""Use browser rendering. Often provides the best quality."""
"""Use browser rendering. Better quality, but slower and more expensive."""


@document_enum
Expand All @@ -65,11 +64,11 @@ class ExtractFromParam(BaseModel):
"enumMeta": {
ExtractFrom.browserHtml: {
"title": "browserHtml",
"description": "Use browser rendering. Often provides the best quality.",
"description": "Use browser rendering. Better quality, but slower and more expensive.",
},
ExtractFrom.httpResponseBody: {
"title": "httpResponseBody",
"description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.",
"description": "Use raw responses. Fast and cheap.",
},
},
},
Expand Down
79 changes: 52 additions & 27 deletions zyte_spider_templates/spiders/ecommerce.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,30 +40,42 @@
class EcommerceCrawlStrategy(str, Enum):
automatic: str = "automatic"
"""
Automatically use the best crawl strategy based on the given URL inputs.

If given a homepage URL, it would attempt to crawl as many products it can discover.
Otherwise, it attempt to crawl the products on a given page category.
Automatically select the best approach. A good default for most use cases.
Currently it uses heuristics only on the homepages of websites (similar to
Full strategy), and follows product, category and pagination links on other
pages (similar to Navigation strategy).
"""

full: str = "full"
"""Follow most links within the domain of URL in an attempt to discover and
extract as many products as possible."""
"""
Follow most links on the website to discover and extract as many products
as possible. If an input URL is a link to a particular category on a
website, the spider may crawl products outside this category. Try this
strategy if other strategies miss items.
"""

navigation: str = "navigation"
"""Follow pagination, subcategories, and product detail pages.

Pagination Only is a better choice if the target URL does not have
subcategories, or if Zyte API is misidentifying some URLs as subcategories.
"""
Follow pagination, subcategories, and product links only. If an input URL
is a link to a particular category on a website, the spider will try to
stay within this category.
"""

pagination_only: str = "pagination_only"
"""Follow pagination and product detail pages. Subcategory links are
ignored."""
"""
Follow pagination and product links only. This strategy is similar to
Navigation, but it doesn't support subcategories. Use it when you need the
spider to stay within a certain category on a website, but Automatic or
Navigation strategies fail to do so because of misclassified subcategory links.
"""

direct_item: str = "direct_item"
"""Treat input URLs as direct links to product detail pages, and extract an
product from each."""
"""
Directly extract products from the provided URLs, without any crawling. To
use this strategy, pass individual product URLs to the spider, not the
website or product category URLs. Common use cases are product monitoring
and batch extraction.
"""


class EcommerceCrawlStrategyParam(BaseModel):
Expand All @@ -75,40 +87,53 @@ class EcommerceCrawlStrategyParam(BaseModel):
"enumMeta": {
EcommerceCrawlStrategy.automatic: {
"description": (
"Automatically use the best crawl strategy based on the given "
"URL inputs. If given a homepage URL, it would attempt to crawl "
"as many products it can discover. Otherwise, it attempt to "
"crawl the products on a given page category."
"Automatically select the best approach. A good "
"default for most use cases. Currently it uses "
"heuristics only on the homepages of websites (similar "
"to Full strategy), and follows product, category and "
"pagination links on other pages (similar to Navigation "
"strategy)."
),
"title": "Automatic",
},
EcommerceCrawlStrategy.full: {
"title": "Full",
"description": (
"Follow most links within the domain of URL in an attempt to "
"discover and extract as many products as possible."
"Follow most links on the website to discover and "
"extract as many products as possible. If an input URL "
"is a link to a particular category on a website, the "
"spider may crawl products outside this category. Try "
"this strategy if other strategies miss items."
),
},
EcommerceCrawlStrategy.navigation: {
"title": "Navigation",
"description": (
"Follow pagination, subcategories, and product detail pages. "
"Pagination Only is a better choice if the target URL does not "
"have subcategories, or if Zyte API is misidentifying some URLs "
"as subcategories."
"Follow pagination, subcategories, and product links "
"only. If an input URL is a link to a particular "
"category on a website, the spider will try to stay "
"within this category."
),
},
EcommerceCrawlStrategy.pagination_only: {
"title": "Pagination Only",
"description": (
"Follow pagination and product detail pages. Subcategory links are ignored."
"Follow pagination and product links only. This "
"strategy is similar to Navigation, but it doesn't "
"support subcategories. Use it when you need the "
"spider to stay within a certain category on a "
"website, but Automatic or Navigation strategies fail "
"to do so because of misclassified subcategory links."
),
},
EcommerceCrawlStrategy.direct_item: {
"title": "Direct URLs to Product",
"description": (
"Treat input URLs as direct links to product detail pages, and "
"extract a product from each."
"Directly extract products from the provided URLs, "
"without any crawling. To use this strategy, pass "
"individual product URLs to the spider, not the "
"website or product category URLs. Common use cases "
"are product monitoring and batch extraction."
),
},
},
Expand Down
Loading