-
Notifications
You must be signed in to change notification settings - Fork 2
/
scrape-images.py
49 lines (35 loc) · 1.24 KB
/
scrape-images.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env python
__author__ = 'kalcho'
import os
from urllib import urlopen, urlretrieve
from bs4 import BeautifulSoup
download_directory = "downloaded"
base_url = "http://pythonwebscraping.com"
def get_absolute_url(base_url, source):
    """Resolve a tag's src value *source* against *base_url*.

    Normalizes away a leading "www." so URLs compare consistently, and
    returns None for any URL that does not belong to *base_url*'s site
    (the crawler only mirrors on-site assets).
    """
    if source.startswith("http://www."):
        # Drop the "www." host prefix: http://www.example.com -> http://example.com
        url = "http://" + source[len("http://www."):]
    elif source.startswith("https://www."):
        url = "https://" + source[len("https://www."):]
    elif source.startswith(("http://", "https://")):
        # Already absolute; previously https:// URLs fell through to the
        # relative branch and were mangled into base_url + "/" + "https://..."
        url = source
    elif source.startswith("www."):
        url = "http://" + source[len("www."):]
    else:
        # Relative path; strip any leading "/" so we don't emit "base//path".
        url = base_url + "/" + source.lstrip("/")
    if base_url not in url:
        return None  # off-site asset — skip it
    return url
def get_download_path(base_url, absolute_url, download_dir):
    """Map *absolute_url* to a local file path under *download_dir*.

    Mirrors the remote path layout locally by stripping the host portion
    of the URL. Creates any missing intermediate directories as a side
    effect, and returns the full local path to write the file to.
    """
    # Strip "www." first so the base_url substring match below succeeds
    # for both www and non-www forms of the same host.
    path = absolute_url.replace("www.", "")
    path = path.replace(base_url, "")
    # BUG FIX: previously concatenated the module-level global
    # download_directory, silently ignoring the download_dir parameter.
    path = download_dir + path
    directory = os.path.dirname(path)
    # Guard against an empty dirname (URL with no path component).
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    return path
# Fetch the site's front page and mirror every src-referenced asset
# (img, script, iframe, ...) into the local download directory.
html = urlopen(base_url)
bsObj = BeautifulSoup(html, "html.parser")
download_list = bsObj.find_all(src=True)
downloaded = set()  # skip assets referenced more than once on the page
for download in download_list:
    file_url = get_absolute_url(base_url, download["src"])
    if file_url is None or file_url in downloaded:
        continue
    downloaded.add(file_url)
    # Single-argument print() works identically on Python 2 and 3
    # (the original `print file_url` statement was Python-2-only).
    print(file_url)
    try:
        urlretrieve(file_url, get_download_path(base_url, file_url, download_directory))
    except IOError as err:
        # Best-effort crawl: one failed asset should not abort the run.
        print("failed to download %s: %s" % (file_url, err))