From e4fed50102769d8dd0682d44a41b287e5133ecca Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Mon, 9 Sep 2024 17:05:07 +0500 Subject: [PATCH 1/2] Reword param descriptions. --- tests/test_ecommerce.py | 13 +++++-------- zyte_spider_templates/params.py | 9 ++++----- zyte_spider_templates/spiders/ecommerce.py | 19 ++++++------------- 3 files changed, 15 insertions(+), 26 deletions(-) diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 4e794fb..21268d0 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -466,16 +466,13 @@ def test_metadata(): }, "navigation": { "description": ( - "Follow pagination, subcategories, and product detail " - "pages. Pagination Only is a better choice if the target " - "URL does not have subcategories, or if Zyte API is " - "misidentifying some URLs as subcategories." + "Follow only subcategories, pagination and product detail pages." ), - "title": "Navigation", + "title": "Category", }, "pagination_only": { "description": ( - "Follow pagination and product detail pages. Subcategory links are ignored." + "Follow only pagination and product detail pages. Subcategory links are ignored." ), "title": "Pagination Only", }, @@ -534,11 +531,11 @@ def test_metadata(): ), "enumMeta": { "browserHtml": { - "description": "Use browser rendering. Often provides the best quality.", + "description": "Use browser rendering. Better quality, but slower and more expensive.", "title": "browserHtml", }, "httpResponseBody": { - "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.", + "description": "Use raw responses. Fast and cheap.", "title": "httpResponseBody", }, }, diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index ca1158b..54eb900 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -26,11 +26,10 @@ @document_enum class ExtractFrom(str, Enum): httpResponseBody: str = "httpResponseBody" - """Use HTTP responses. Cost-efficient and fast extraction method, which - works well on many websites.""" + """Use raw responses. Fast and cheap.""" browserHtml: str = "browserHtml" - """Use browser rendering. Often provides the best quality.""" + """Use browser rendering. Better quality, but slower and more expensive.""" class ExtractFromParam(BaseModel): @@ -45,11 +44,11 @@ class ExtractFromParam(BaseModel): "enumMeta": { ExtractFrom.browserHtml: { "title": "browserHtml", - "description": "Use browser rendering. Often provides the best quality.", + "description": "Use browser rendering. Better quality, but slower and more expensive.", }, ExtractFrom.httpResponseBody: { "title": "httpResponseBody", - "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.", + "description": "Use raw responses. Fast and cheap.", }, }, }, diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index eefb43e..187c4df 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -45,15 +45,11 @@ class EcommerceCrawlStrategy(str, Enum): extract as many products as possible.""" navigation: str = "navigation" - """Follow pagination, subcategories, and product detail pages. - - Pagination Only is a better choice if the target URL does not have - subcategories, or if Zyte API is misidentifying some URLs as subcategories. - """ + """Follow only subcategories, pagination and product detail pages.""" pagination_only: str = "pagination_only" - """Follow pagination and product detail pages. Subcategory links are - ignored.""" + """Follow only pagination and product detail pages. Subcategory links + are ignored.""" direct_item: str = "direct_item" """Treat input URLs as direct links to product detail pages, and extract an @@ -84,18 +80,15 @@ class EcommerceCrawlStrategyParam(BaseModel): ), }, EcommerceCrawlStrategy.navigation: { - "title": "Navigation", + "title": "Category", "description": ( - "Follow pagination, subcategories, and product detail pages. " - "Pagination Only is a better choice if the target URL does not " - "have subcategories, or if Zyte API is misidentifying some URLs " - "as subcategories." + "Follow only subcategories, pagination and product detail pages." ), }, EcommerceCrawlStrategy.pagination_only: { "title": "Pagination Only", "description": ( - "Follow pagination and product detail pages. Subcategory links are ignored." + "Follow only pagination and product detail pages. Subcategory links are ignored." ), }, EcommerceCrawlStrategy.direct_item: { From 954dd3d0e3ea94108f2107f258fed91b6d684293 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Fri, 11 Oct 2024 15:13:52 +0500 Subject: [PATCH 2/2] More rewording. --- tests/test_ecommerce.py | 38 +++++++---- zyte_spider_templates/spiders/ecommerce.py | 76 +++++++++++++++------- 2 files changed, 81 insertions(+), 33 deletions(-) diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 25070d8..69d9466 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -426,36 +426,52 @@ def test_metadata(): "enumMeta": { "automatic": { "description": ( - "Automatically use the best crawl strategy based on the given " - "URL inputs. If given a homepage URL, it would attempt to crawl " - "as many products it can discover. Otherwise, it attempt to " - "crawl the products on a given page category." + "Automatically select the best approach. A good " + "default for most use cases. Currently it uses " + "heuristics only on the homepages of websites (similar " + "to Full strategy), and follows product, category and " + "pagination links on other pages (similar to Navigation " + "strategy)." ), "title": "Automatic", }, "direct_item": { "description": ( - "Treat input URLs as direct links to product detail pages, and " - "extract a product from each." + "Directly extract products from the provided URLs, " + "without any crawling. To use this strategy, pass " + "individual product URLs to the spider, not the " + "website or product category URLs. Common use cases " + "are product monitoring and batch extraction." ), "title": "Direct URLs to Product", }, "full": { "description": ( - "Follow most links within the domain of URL in an attempt " - "to discover and extract as many products as possible." + "Follow most links on the website to discover and " + "extract as many products as possible. If an input URL " + "is a link to a particular category on a website, the " + "spider may crawl products outside this category. Try " + "this strategy if other strategies miss items." ), "title": "Full", }, "navigation": { "description": ( - "Follow only subcategories, pagination and product detail pages." + "Follow pagination, subcategories, and product links " + "only. If an input URL is a link to a particular " + "category on a website, the spider will try to stay " + "within this category." ), - "title": "Category", + "title": "Navigation", }, "pagination_only": { "description": ( - "Follow only pagination and product detail pages. Subcategory links are ignored." + "Follow pagination and product links only. This " + "strategy is similar to Navigation, but it doesn't " + "support subcategories. Use it when you need the " + "spider to stay within a certain category on a " + "website, but Automatic or Navigation strategies fail " + "to do so because of misclassified subcategory links." ), "title": "Pagination Only", }, diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index 3ecfb7b..5e87266 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -40,26 +40,42 @@ class EcommerceCrawlStrategy(str, Enum): automatic: str = "automatic" """ - Automatically use the best crawl strategy based on the given URL inputs. - - If given a homepage URL, it would attempt to crawl as many products it can discover. - Otherwise, it attempt to crawl the products on a given page category. + Automatically select the best approach. A good default for most use cases. + Currently it uses heuristics only on the homepages of websites (similar to + Full strategy), and follows product, category and pagination links on other + pages (similar to Navigation strategy). """ full: str = "full" - """Follow most links within the domain of URL in an attempt to discover and - extract as many products as possible.""" + """ + Follow most links on the website to discover and extract as many products + as possible. If an input URL is a link to a particular category on a + website, the spider may crawl products outside this category. Try this + strategy if other strategies miss items. + """ navigation: str = "navigation" - """Follow only subcategories, pagination and product detail pages.""" + """ + Follow pagination, subcategories, and product links only. If an input URL + is a link to a particular category on a website, the spider will try to + stay within this category. + """ pagination_only: str = "pagination_only" - """Follow only pagination and product detail pages. Subcategory links - are ignored.""" + """ + Follow pagination and product links only. This strategy is similar to + Navigation, but it doesn't support subcategories. Use it when you need the + spider to stay within a certain category on a website, but Automatic or + Navigation strategies fail to do so because of misclassified subcategory links. + """ direct_item: str = "direct_item" - """Treat input URLs as direct links to product detail pages, and extract an - product from each.""" + """ + Directly extract products from the provided URLs, without any crawling. To + use this strategy, pass individual product URLs to the spider, not the + website or product category URLs. Common use cases are product monitoring + and batch extraction. + """ class EcommerceCrawlStrategyParam(BaseModel): @@ -71,37 +87,53 @@ class EcommerceCrawlStrategyParam(BaseModel): "enumMeta": { EcommerceCrawlStrategy.automatic: { "description": ( - "Automatically use the best crawl strategy based on the given " - "URL inputs. If given a homepage URL, it would attempt to crawl " - "as many products it can discover. Otherwise, it attempt to " - "crawl the products on a given page category." + "Automatically select the best approach. A good " + "default for most use cases. Currently it uses " + "heuristics only on the homepages of websites (similar " + "to Full strategy), and follows product, category and " + "pagination links on other pages (similar to Navigation " + "strategy)." ), "title": "Automatic", }, EcommerceCrawlStrategy.full: { "title": "Full", "description": ( - "Follow most links within the domain of URL in an attempt to " - "discover and extract as many products as possible." + "Follow most links on the website to discover and " + "extract as many products as possible. If an input URL " + "is a link to a particular category on a website, the " + "spider may crawl products outside this category. Try " + "this strategy if other strategies miss items." ), }, EcommerceCrawlStrategy.navigation: { - "title": "Category", + "title": "Navigation", "description": ( - "Follow only subcategories, pagination and product detail pages." + "Follow pagination, subcategories, and product links " + "only. If an input URL is a link to a particular " + "category on a website, the spider will try to stay " + "within this category." ), }, EcommerceCrawlStrategy.pagination_only: { "title": "Pagination Only", "description": ( - "Follow only pagination and product detail pages. Subcategory links are ignored." + "Follow pagination and product links only. This " + "strategy is similar to Navigation, but it doesn't " + "support subcategories. Use it when you need the " + "spider to stay within a certain category on a " + "website, but Automatic or Navigation strategies fail " + "to do so because of misclassified subcategory links." ), }, EcommerceCrawlStrategy.direct_item: { "title": "Direct URLs to Product", "description": ( - "Treat input URLs as direct links to product detail pages, and " - "extract a product from each." + "Directly extract products from the provided URLs, " + "without any crawling. To use this strategy, pass " + "individual product URLs to the spider, not the " + "website or product category URLs. Common use cases " + "are product monitoring and batch extraction." ), }, },