[etsy] upgrade scrape config and except errors
mazen-r committed Nov 5, 2024
1 parent d139261 commit 3f908cf
Showing 1 changed file with 54 additions and 25 deletions.

etsy-scraper/etsy.py
@@ -4,6 +4,7 @@
 To run this scraper set env variable $SCRAPFLY_KEY with your scrapfly API key:
 $ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
 """
+
 import os
 import math
 import json
@@ -42,30 +43,39 @@ def parse_search(response: ScrapeApiResponse) -> Dict:
         number_of_reviews = strip_text(product.xpath(".//div[contains(@aria-label,'star rating')]/p/text()").get())
         if number_of_reviews:
             number_of_reviews = number_of_reviews.replace("(", "").replace(")", "")
-            number_of_reviews = int(number_of_reviews.replace("k", "").replace(".", "")) * 10 if "k" in number_of_reviews else number_of_reviews
+            number_of_reviews = (
+                int(number_of_reviews.replace("k", "").replace(".", "")) * 10
+                if "k" in number_of_reviews
+                else number_of_reviews
+            )
         price = product.xpath(".//span[@class='currency-value']/text()").get()
         original_price = product.xpath(".//span[contains(text(),'Original Price')]/text()").get()
         discount = strip_text(product.xpath(".//span[contains(text(),'off')]/text()").get())
         seller = product.xpath(".//span[contains(text(),'From shop')]/text()").get()
         currency = product.xpath(".//span[@class='currency-symbol']/text()").get()
-        data.append({
-            "productLink": '/'.join(link.split('/')[:5]) if link else None,
-            "productTitle": strip_text(product.xpath(".//h3[contains(@class, 'v2-listing-card__titl')]/@title").get()),
-            "productImage": product.xpath("//img[@data-listing-card-listing-image]/@src").get(),
-            "seller": seller.replace("From shop ", "") if seller else None,
-            "listingType": "Paid listing" if product.xpath(".//span[@data-ad-label='Ad by Etsy seller']") else "Free listing",
-            "productRate": float(rate.strip()) if rate else None,
-            "numberOfReviews": int(number_of_reviews) if number_of_reviews else None,
-            "freeShipping": "Yes" if product.xpath(".//span[contains(text(),'Free shipping')]/text()").get() else "No",
-            "productPrice": float(price.replace(",", "")) if price else None,
-            "priceCurrency": currency,
-            "originalPrice": float(original_price.split(currency)[-1].strip()) if original_price else "No discount",
-            "discount": discount if discount else "No discount",
-        })
-    return {
-        "search_data": data,
-        "total_pages": total_pages
-    }
+        data.append(
+            {
+                "productLink": "/".join(link.split("/")[:5]) if link else None,
+                "productTitle": strip_text(
+                    product.xpath(".//h3[contains(@class, 'v2-listing-card__titl')]/@title").get()
+                ),
+                "productImage": product.xpath("//img[@data-listing-card-listing-image]/@src").get(),
+                "seller": seller.replace("From shop ", "") if seller else None,
+                "listingType": (
+                    "Paid listing" if product.xpath(".//span[@data-ad-label='Ad by Etsy seller']") else "Free listing"
+                ),
+                "productRate": float(rate.strip()) if rate else None,
+                "numberOfReviews": int(number_of_reviews) if number_of_reviews else None,
+                "freeShipping": (
+                    "Yes" if product.xpath(".//span[contains(text(),'Free shipping')]/text()").get() else "No"
+                ),
+                "productPrice": float(price.replace(",", "")) if price else None,
+                "priceCurrency": currency,
+                "originalPrice": float(original_price.split(currency)[-1].strip()) if original_price else "No discount",
+                "discount": discount if discount else "No discount",
+            }
+        )
+    return {"search_data": data, "total_pages": total_pages}


 def parse_product_page(response: ScrapeApiResponse) -> Dict:
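
The reformatted `k`-suffix branch above keeps the commit's original heuristic, which strips the digits and multiplies by 10, so a count shown as "(1.2k)" parses to 120 rather than 1200. If exact values matter, a small helper along these lines could replace it (a minimal sketch; parse_review_count is a hypothetical name, not part of this commit):

def parse_review_count(raw: str) -> int:
    """Convert Etsy-style review counts such as "(1.2k)" or "(87)" to integers."""
    cleaned = raw.strip("() ").lower()
    if cleaned.endswith("k"):
        # "1.2k" -> 1.2 * 1000 = 1200
        return int(float(cleaned[:-1]) * 1000)
    return int(cleaned.replace(",", ""))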
@@ -88,7 +98,15 @@ async def scrape_search(url: str, max_pages: int = None) -> List[Dict]:
     """scrape product listing data from Etsy search pages"""
     log.info("scraping the first search page")
     # etsy search pages are dynamic, requiring render_js enabled
-    first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, wait_for_selector="//div[@data-search-pagination]", render_js=True, **BASE_CONFIG))
+    first_page = await SCRAPFLY.async_scrape(
+        ScrapeConfig(
+            url,
+            wait_for_selector="//div[@data-search-pagination]",
+            render_js=True,
+            proxy_pool="public_residential_pool",
+            **BASE_CONFIG,
+        )
+    )
     data = parse_search(first_page)
     search_data = data["search_data"]

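The upgraded first-page request combines three options: render_js runs the page in a headless browser (the search results are rendered client-side), wait_for_selector holds the response until the pagination container exists, and the newly added proxy_pool routes the request through residential IPs to reduce blocking. A self-contained sketch of the same call (the search URL is an example, and the file's BASE_CONFIG extras are omitted here):

import asyncio
import os

from scrapfly import ScrapeConfig, ScrapflyClient

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])


async def fetch_search_page(url: str):
    # wait until the pagination container is rendered before returning the HTML
    return await SCRAPFLY.async_scrape(
        ScrapeConfig(
            url,
            render_js=True,
            wait_for_selector="//div[@data-search-pagination]",
            proxy_pool="public_residential_pool",
        )
    )


response = asyncio.run(fetch_search_page("https://www.etsy.com/search?q=ceramic+mug"))
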
@@ -98,15 +116,25 @@ async def scrape_search(url: str, max_pages: int = None) -> List[Dict]:
         total_pages = max_pages

     log.info(f"scraping search pagination ({total_pages - 1} more pages)")
-    # add the remaining search pages in a scraping list
+    # add the remaining search pages in a scraping list
     other_pages = [
-        ScrapeConfig(url + f"&page={page_number}", wait_for_selector="//div[@data-search-pagination]", render_js=True, **BASE_CONFIG)
+        ScrapeConfig(
+            url + f"&page={page_number}",
+            wait_for_selector="//div[@data-search-pagination]",
+            render_js=True,
+            proxy_pool="public_residential_pool",
+            **BASE_CONFIG,
+        )
         for page_number in range(2, total_pages + 1)
     ]
     # scrape the remaining search pages concurrently
     async for response in SCRAPFLY.concurrent_scrape(other_pages):
-        data = parse_search(response)
-        search_data.extend(data["search_data"])
+        try:
+            data = parse_search(response)
+            search_data.extend(data["search_data"])
+        except Exception as e:
+            log.error(f"failed to scrape search page: {e}")
+            pass
     log.success(f"scraped {len(search_data)} product listings from search")
     return search_data

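Note that the pagination list builds URLs with url + f"&page={page_number}", which assumes the search URL already carries a query string. A more defensive variant could set the parameter explicitly (a sketch; with_page is a hypothetical helper, not in the commit):

from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse


def with_page(url: str, page_number: int) -> str:
    """Return the URL with its `page` query parameter set or replaced."""
    parts = urlparse(url)
    query = dict(parse_qsl(parts.query))
    query["page"] = str(page_number)
    return urlunparse(parts._replace(query=urlencode(query)))


# with_page("https://www.etsy.com/search?q=mug", 3)
# -> "https://www.etsy.com/search?q=mug&page=3"
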
@@ -123,6 +151,7 @@ async def scrape_product(urls: List[str]) -> List[Dict]:
     log.success(f"scraped {len(products)} product listings from product pages")
     return products

+
 async def scrape_shop(urls: List[str]) -> List[Dict]:
     shops = []
     # add the shop page URLs to a scraping list
@@ -132,4 +161,4 @@ async def scrape_shop(urls: List[str]) -> List[Dict]:
         data = parse_shop_page(response)
         shops.append(data)
     log.success(f"scraped {len(shops)} shops from shop pages")
-    return shops
\ No newline at end of file
+    return shops
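
With the changes applied, the scraper can be exercised with a short runner script; a minimal sketch (assuming the module is importable as etsy per the file path, with the search URL and page count as examples):

import asyncio
import json

from etsy import scrape_search


async def main():
    listings = await scrape_search("https://www.etsy.com/search?q=ceramic+mug", max_pages=3)
    with open("search.json", "w", encoding="utf-8") as f:
        json.dump(listings, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    asyncio.run(main())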
