scrape_danube.py
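"""Scrape product data from danube.sa.

Walks every department's paginated product listing with Selenium,
downloads each product image into images/, and writes the department,
image path, title, and price of every product to danube_products.csv.
"""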
import csv
import os
import re

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Requires Google Chrome and a compatible chromedriver
# (Selenium 4.6+ can resolve the driver automatically via Selenium Manager).
browser = webdriver.Chrome()
def main():
    departments = []
    images = []
    titles = []
    prices = []
    base_url = "https://danube.sa"
    departments_url = "https://danube.sa/en/departments/"

    print("[INFO]: Starting scrape of all danube products. "
          "(This may take > 1 hour depending on your internet connection)")

    # The department index is static HTML, so plain requests is enough here.
    r = requests.get(departments_url)
    soup = BeautifulSoup(r.text, 'html.parser')

    # Make sure the image download target exists before the first write.
    os.makedirs("images", exist_ok=True)

    for dept in soup.find_all('div', class_="department-box"):
        dept_name = dept.find('div', class_='department-box__title').text
        all_link = base_url + dept.find('a', class_="department-box__all-link").get("href")

        # Walk the department's paginated listing until a page renders no
        # products (or a hard cap of 100 pages is reached).
        page = 1
        while page < 100:
            browser.get(all_link + "?page=" + str(page))
            try:
                WebDriverWait(browser, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'product-box')))
            except TimeoutException:
                # No products appeared within 10 s: assume we ran past the last page.
                break

            all_soup = BeautifulSoup(browser.page_source, 'html.parser')
            for product in all_soup.find_all('div', class_="product-box"):
                departments.append(dept_name)

                product_title = product.find('div', class_='product-box__name').text
                titles.append(product_title)

                # Keep only the numeric part of the displayed price.
                product_price = product.find('div', class_='product-price__current-price').text
                product_price = re.findall(r"(\d+\.\d{1,2})", product_price)[0]
                prices.append(product_price)

                # The image URL is embedded in the inline style as url(...);
                # strip any surrounding quotes as well as the parentheses.
                style = product.find('div', class_="product-box__image__element").get('style')
                image_url = re.findall(r'\((.*?)\)', style)[0].strip('\'" ')
                filename = "images/" + image_url.split("/")[-1]
                try:
                    print(image_url)
                    img_r = requests.get(image_url)
                    with open(filename, 'wb') as f:
                        for chunk in img_r.iter_content(chunk_size=8192):
                            f.write(chunk)
                    images.append(filename)
                except Exception:
                    # Keep the row aligned with the other columns even if the download fails.
                    images.append("None")
            page += 1

    print("[INFO]: Scraping complete.")
    print("[INFO]: Writing CSV to file...")
    with open('danube_products.csv', 'w', newline='') as csv_file:
        fieldnames = ['department', 'image', 'title', 'price']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for index in range(len(titles)):
            try:
                writer.writerow({'department': departments[index],
                                 'image': images[index],
                                 'title': titles[index],
                                 'price': prices[index]})
                print("INFO [Building CSV]: {:.2f}% complete".format((index / len(titles)) * 100))
            except Exception as e:
                # Log the failure and move on so one bad row cannot stall the loop.
                print("ERROR: {}".format(e))
                print("INFO: ERROR AT INDEX {}".format(index))

    browser.quit()
    print("[INFO]: Done")


if __name__ == "__main__":
    main()