[etsy] upgrade scrape config and except errors
mazen-r committed Nov 5, 2024
1 parent d139261 commit 3f908cf
Showing 1 changed file with 54 additions and 25 deletions.

etsy-scraper/etsy.py
@@ -4,6 +4,7 @@
 To run this scraper set env variable $SCRAPFLY_KEY with your scrapfly API key:
 $ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
 """
+
 import os
 import math
 import json
@@ -42,30 +43,39 @@ def parse_search(response: ScrapeApiResponse) -> Dict:
         number_of_reviews = strip_text(product.xpath(".//div[contains(@aria-label,'star rating')]/p/text()").get())
         if number_of_reviews:
             number_of_reviews = number_of_reviews.replace("(", "").replace(")", "")
-            number_of_reviews = int(number_of_reviews.replace("k", "").replace(".", "")) * 10 if "k" in number_of_reviews else number_of_reviews
+            number_of_reviews = (
+                int(number_of_reviews.replace("k", "").replace(".", "")) * 10
+                if "k" in number_of_reviews
+                else number_of_reviews
+            )
         price = product.xpath(".//span[@class='currency-value']/text()").get()
         original_price = product.xpath(".//span[contains(text(),'Original Price')]/text()").get()
         discount = strip_text(product.xpath(".//span[contains(text(),'off')]/text()").get())
         seller = product.xpath(".//span[contains(text(),'From shop')]/text()").get()
         currency = product.xpath(".//span[@class='currency-symbol']/text()").get()
-        data.append({
-            "productLink": '/'.join(link.split('/')[:5]) if link else None,
-            "productTitle": strip_text(product.xpath(".//h3[contains(@class, 'v2-listing-card__titl')]/@title").get()),
-            "productImage": product.xpath("//img[@data-listing-card-listing-image]/@src").get(),
-            "seller": seller.replace("From shop ", "") if seller else None,
-            "listingType": "Paid listing" if product.xpath(".//span[@data-ad-label='Ad by Etsy seller']") else "Free listing",
-            "productRate": float(rate.strip()) if rate else None,
-            "numberOfReviews": int(number_of_reviews) if number_of_reviews else None,
-            "freeShipping": "Yes" if product.xpath(".//span[contains(text(),'Free shipping')]/text()").get() else "No",
-            "productPrice": float(price.replace(",", "")) if price else None,
-            "priceCurrency": currency,
-            "originalPrice": float(original_price.split(currency)[-1].strip()) if original_price else "No discount",
-            "discount": discount if discount else "No discount",
-        })
-    return {
-        "search_data": data,
-        "total_pages": total_pages
-    }
+        data.append(
+            {
+                "productLink": "/".join(link.split("/")[:5]) if link else None,
+                "productTitle": strip_text(
+                    product.xpath(".//h3[contains(@class, 'v2-listing-card__titl')]/@title").get()
+                ),
+                "productImage": product.xpath("//img[@data-listing-card-listing-image]/@src").get(),
+                "seller": seller.replace("From shop ", "") if seller else None,
+                "listingType": (
+                    "Paid listing" if product.xpath(".//span[@data-ad-label='Ad by Etsy seller']") else "Free listing"
+                ),
+                "productRate": float(rate.strip()) if rate else None,
+                "numberOfReviews": int(number_of_reviews) if number_of_reviews else None,
+                "freeShipping": (
+                    "Yes" if product.xpath(".//span[contains(text(),'Free shipping')]/text()").get() else "No"
+                ),
+                "productPrice": float(price.replace(",", "")) if price else None,
+                "priceCurrency": currency,
+                "originalPrice": float(original_price.split(currency)[-1].strip()) if original_price else "No discount",
+                "discount": discount if discount else "No discount",
+            }
+        )
+    return {"search_data": data, "total_pages": total_pages}


 def parse_product_page(response: ScrapeApiResponse) -> Dict:
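
The reformatted `k`-suffix branch above keeps the commit's original heuristic, which strips the digits and multiplies by 10, so a count shown as "(1.2k)" parses to 120 rather than 1200. If exact values matter, a small helper along these lines could replace it (a minimal sketch; parse_review_count is a hypothetical name, not part of this commit):

def parse_review_count(raw: str) -> int:
    """Convert Etsy-style review counts such as "(1.2k)" or "(87)" to integers."""
    cleaned = raw.strip("() ").lower()
    if cleaned.endswith("k"):
        # "1.2k" -> 1.2 * 1000 = 1200
        return int(float(cleaned[:-1]) * 1000)
    return int(cleaned.replace(",", ""))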
@@ -88,7 +98,15 @@ async def scrape_search(url: str, max_pages: int = None) -> List[Dict]:
     """scrape product listing data from Etsy search pages"""
     log.info("scraping the first search page")
     # etsy search pages are dynamic, requiring render_js enabled
-    first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, wait_for_selector="//div[@data-search-pagination]", render_js=True, **BASE_CONFIG))
+    first_page = await SCRAPFLY.async_scrape(
+        ScrapeConfig(
+            url,
+            wait_for_selector="//div[@data-search-pagination]",
+            render_js=True,
+            proxy_pool="public_residential_pool",
+            **BASE_CONFIG,
+        )
+    )
     data = parse_search(first_page)
     search_data = data["search_data"]

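The upgraded first-page request combines three options: render_js runs the page in a headless browser (the search results are rendered client-side), wait_for_selector holds the response until the pagination container exists, and the newly added proxy_pool routes the request through residential IPs to reduce blocking. A self-contained sketch of the same call (the search URL is an example, and the file's BASE_CONFIG extras are omitted here):

import asyncio
import os

from scrapfly import ScrapeConfig, ScrapflyClient

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])


async def fetch_search_page(url: str):
    # wait until the pagination container is rendered before returning the HTML
    return await SCRAPFLY.async_scrape(
        ScrapeConfig(
            url,
            render_js=True,
            wait_for_selector="//div[@data-search-pagination]",
            proxy_pool="public_residential_pool",
        )
    )


response = asyncio.run(fetch_search_page("https://www.etsy.com/search?q=ceramic+mug"))
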
@@ -98,15 +116,25 @@ async def scrape_search(url: str, max_pages: int = None) -> List[Dict]:
         total_pages = max_pages

     log.info(f"scraping search pagination ({total_pages - 1} more pages)")
-    # add the remaining search pages in a scraping list
+    # add the remaining search pages in a scraping list
     other_pages = [
-        ScrapeConfig(url + f"&page={page_number}", wait_for_selector="//div[@data-search-pagination]", render_js=True, **BASE_CONFIG)
+        ScrapeConfig(
+            url + f"&page={page_number}",
+            wait_for_selector="//div[@data-search-pagination]",
+            render_js=True,
+            proxy_pool="public_residential_pool",
+            **BASE_CONFIG,
+        )
         for page_number in range(2, total_pages + 1)
     ]
     # scrape the remaining search pages concurrently
     async for response in SCRAPFLY.concurrent_scrape(other_pages):
-        data = parse_search(response)
-        search_data.extend(data["search_data"])
+        try:
+            data = parse_search(response)
+            search_data.extend(data["search_data"])
+        except Exception as e:
+            log.error(f"failed to scrape search page: {e}")
+            pass
     log.success(f"scraped {len(search_data)} product listings from search")
     return search_data

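Note that the pagination list builds URLs with url + f"&page={page_number}", which assumes the search URL already carries a query string. A more defensive variant could set the parameter explicitly (a sketch; with_page is a hypothetical helper, not in the commit):

from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse


def with_page(url: str, page_number: int) -> str:
    """Return the URL with its `page` query parameter set or replaced."""
    parts = urlparse(url)
    query = dict(parse_qsl(parts.query))
    query["page"] = str(page_number)
    return urlunparse(parts._replace(query=urlencode(query)))


# with_page("https://www.etsy.com/search?q=mug", 3)
# -> "https://www.etsy.com/search?q=mug&page=3"
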
@@ -123,6 +151,7 @@ async def scrape_product(urls: List[str]) -> List[Dict]:
     log.success(f"scraped {len(products)} product listings from product pages")
     return products

+
 async def scrape_shop(urls: List[str]) -> List[Dict]:
     shops = []
     # add the shop page URLs to a scraping list
@@ -132,4 +161,4 @@ async def scrape_shop(urls: List[str]) -> List[Dict]:
         data = parse_shop_page(response)
         shops.append(data)
     log.success(f"scraped {len(shops)} shops from shop pages")
-    return shops
\ No newline at end of file
+    return shops
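
With the changes applied, the scraper can be exercised with a short runner script; a minimal sketch (assuming the module is importable as etsy per the file path, with the search URL and page count as examples):

import asyncio
import json

from etsy import scrape_search


async def main():
    listings = await scrape_search("https://www.etsy.com/search?q=ceramic+mug", max_pages=3)
    with open("search.json", "w", encoding="utf-8") as f:
        json.dump(listings, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    asyncio.run(main())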
