diff --git a/tests/test_ecommerce.py b/tests/test_ecommerce.py index 23bb15a..69d9466 100644 --- a/tests/test_ecommerce.py +++ b/tests/test_ecommerce.py @@ -426,39 +426,52 @@ def test_metadata(): "enumMeta": { "automatic": { "description": ( - "Automatically use the best crawl strategy based on the given " - "URL inputs. If given a homepage URL, it would attempt to crawl " - "as many products it can discover. Otherwise, it attempt to " - "crawl the products on a given page category." + "Automatically select the best approach. A good " + "default for most use cases. Currently it uses " + "heuristics only on the homepages of websites (similar " + "to Full strategy), and follows product, category and " + "pagination links on other pages (similar to Navigation " + "strategy)." ), "title": "Automatic", }, "direct_item": { "description": ( - "Treat input URLs as direct links to product detail pages, and " - "extract a product from each." + "Directly extract products from the provided URLs, " + "without any crawling. To use this strategy, pass " + "individual product URLs to the spider, not the " + "website or product category URLs. Common use cases " + "are product monitoring and batch extraction." ), "title": "Direct URLs to Product", }, "full": { "description": ( - "Follow most links within the domain of URL in an attempt " - "to discover and extract as many products as possible." + "Follow most links on the website to discover and " + "extract as many products as possible. If an input URL " + "is a link to a particular category on a website, the " + "spider may crawl products outside this category. Try " + "this strategy if other strategies miss items." ), "title": "Full", }, "navigation": { "description": ( - "Follow pagination, subcategories, and product detail " - "pages. Pagination Only is a better choice if the target " - "URL does not have subcategories, or if Zyte API is " - "misidentifying some URLs as subcategories." + "Follow pagination, subcategories, and product links " + "only. If an input URL is a link to a particular " + "category on a website, the spider will try to stay " + "within this category." ), "title": "Navigation", }, "pagination_only": { "description": ( - "Follow pagination and product detail pages. Subcategory links are ignored." + "Follow pagination and product links only. This " + "strategy is similar to Navigation, but it doesn't " + "support subcategories. Use it when you need the " + "spider to stay within a certain category on a " + "website, but Automatic or Navigation strategies fail " + "to do so because of misclassified subcategory links." ), "title": "Pagination Only", }, @@ -517,11 +530,11 @@ def test_metadata(): ), "enumMeta": { "browserHtml": { - "description": "Use browser rendering. Often provides the best quality.", + "description": "Use browser rendering. Better quality, but slower and more expensive.", "title": "browserHtml", }, "httpResponseBody": { - "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.", + "description": "Use raw responses. Fast and cheap.", "title": "httpResponseBody", }, }, diff --git a/zyte_spider_templates/params.py b/zyte_spider_templates/params.py index 688d7e8..e74f3f8 100644 --- a/zyte_spider_templates/params.py +++ b/zyte_spider_templates/params.py @@ -34,11 +34,10 @@ @document_enum class ExtractFrom(str, Enum): httpResponseBody: str = "httpResponseBody" - """Use HTTP responses. Cost-efficient and fast extraction method, which - works well on many websites.""" + """Use raw responses. Fast and cheap.""" browserHtml: str = "browserHtml" - """Use browser rendering. Often provides the best quality.""" + """Use browser rendering. Better quality, but slower and more expensive.""" @document_enum @@ -65,11 +64,11 @@ class ExtractFromParam(BaseModel): "enumMeta": { ExtractFrom.browserHtml: { "title": "browserHtml", - "description": "Use browser rendering. Often provides the best quality.", + "description": "Use browser rendering. Better quality, but slower and more expensive.", }, ExtractFrom.httpResponseBody: { "title": "httpResponseBody", - "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.", + "description": "Use raw responses. Fast and cheap.", }, }, }, diff --git a/zyte_spider_templates/spiders/ecommerce.py b/zyte_spider_templates/spiders/ecommerce.py index a799fb9..5e87266 100644 --- a/zyte_spider_templates/spiders/ecommerce.py +++ b/zyte_spider_templates/spiders/ecommerce.py @@ -40,30 +40,42 @@ class EcommerceCrawlStrategy(str, Enum): automatic: str = "automatic" """ - Automatically use the best crawl strategy based on the given URL inputs. - - If given a homepage URL, it would attempt to crawl as many products it can discover. - Otherwise, it attempt to crawl the products on a given page category. + Automatically select the best approach. A good default for most use cases. + Currently it uses heuristics only on the homepages of websites (similar to + Full strategy), and follows product, category and pagination links on other + pages (similar to Navigation strategy). """ full: str = "full" - """Follow most links within the domain of URL in an attempt to discover and - extract as many products as possible.""" + """ + Follow most links on the website to discover and extract as many products + as possible. If an input URL is a link to a particular category on a + website, the spider may crawl products outside this category. Try this + strategy if other strategies miss items. + """ navigation: str = "navigation" - """Follow pagination, subcategories, and product detail pages. - - Pagination Only is a better choice if the target URL does not have - subcategories, or if Zyte API is misidentifying some URLs as subcategories. + """ + Follow pagination, subcategories, and product links only. If an input URL + is a link to a particular category on a website, the spider will try to + stay within this category. """ pagination_only: str = "pagination_only" - """Follow pagination and product detail pages. Subcategory links are - ignored.""" + """ + Follow pagination and product links only. This strategy is similar to + Navigation, but it doesn't support subcategories. Use it when you need the + spider to stay within a certain category on a website, but Automatic or + Navigation strategies fail to do so because of misclassified subcategory links. + """ direct_item: str = "direct_item" - """Treat input URLs as direct links to product detail pages, and extract an - product from each.""" + """ + Directly extract products from the provided URLs, without any crawling. To + use this strategy, pass individual product URLs to the spider, not the + website or product category URLs. Common use cases are product monitoring + and batch extraction. + """ class EcommerceCrawlStrategyParam(BaseModel): @@ -75,40 +87,53 @@ class EcommerceCrawlStrategyParam(BaseModel): "enumMeta": { EcommerceCrawlStrategy.automatic: { "description": ( - "Automatically use the best crawl strategy based on the given " - "URL inputs. If given a homepage URL, it would attempt to crawl " - "as many products it can discover. Otherwise, it attempt to " - "crawl the products on a given page category." + "Automatically select the best approach. A good " + "default for most use cases. Currently it uses " + "heuristics only on the homepages of websites (similar " + "to Full strategy), and follows product, category and " + "pagination links on other pages (similar to Navigation " + "strategy)." ), "title": "Automatic", }, EcommerceCrawlStrategy.full: { "title": "Full", "description": ( - "Follow most links within the domain of URL in an attempt to " - "discover and extract as many products as possible." + "Follow most links on the website to discover and " + "extract as many products as possible. If an input URL " + "is a link to a particular category on a website, the " + "spider may crawl products outside this category. Try " + "this strategy if other strategies miss items." ), }, EcommerceCrawlStrategy.navigation: { "title": "Navigation", "description": ( - "Follow pagination, subcategories, and product detail pages. " - "Pagination Only is a better choice if the target URL does not " - "have subcategories, or if Zyte API is misidentifying some URLs " - "as subcategories." + "Follow pagination, subcategories, and product links " + "only. If an input URL is a link to a particular " + "category on a website, the spider will try to stay " + "within this category." ), }, EcommerceCrawlStrategy.pagination_only: { "title": "Pagination Only", "description": ( - "Follow pagination and product detail pages. Subcategory links are ignored." + "Follow pagination and product links only. This " + "strategy is similar to Navigation, but it doesn't " + "support subcategories. Use it when you need the " + "spider to stay within a certain category on a " + "website, but Automatic or Navigation strategies fail " + "to do so because of misclassified subcategory links." ), }, EcommerceCrawlStrategy.direct_item: { "title": "Direct URLs to Product", "description": ( - "Treat input URLs as direct links to product detail pages, and " - "extract a product from each." + "Directly extract products from the provided URLs, " + "without any crawling. To use this strategy, pass " + "individual product URLs to the spider, not the " + "website or product category URLs. Common use cases " + "are product monitoring and batch extraction." ), }, },