-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetpic.py
83 lines (71 loc) · 2.81 KB
/
getpic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
import json
import urllib3
import urllib3.request
import sys
import time
import certifi
"""READ THIS.
This file uses selenium to download images in their original size from google images search
"""
"""Google working requires the following line to authenticate the service"""
http = urllib3.PoolManager(
    ca_certs=certifi.where(),
    cert_reqs='CERT_REQUIRED',
)
# Append the current working directory to PATH so that a geckodriver binary
# placed there can be located. The download path is set right below.
os.environ["PATH"] = os.environ["PATH"] + os.pathsep + os.getcwd()
# Directory (relative to the working directory) under which images are saved.
download_path = "dataset/"
"""The following function takes 'the text to be searched' and 'maximum number of images to be saved' as arguments"""
def mai(searchtext, num_requested):
    """Download up to ``num_requested`` images from a Google Images search.

    Opens Firefox through selenium, scrolls the result page so that more
    results get loaded, then reads the metadata of every result and saves
    the referenced image into ``download_path/<searchtext>/`` (spaces in
    ``searchtext`` are replaced by underscores). Files are named
    ``0.<ext>``, ``1.<ext>``, ... in download order.

    Args:
        searchtext: The text to be searched.
        num_requested: Maximum number of images to be saved.
    """
    # Imported locally so this function does not rely on changes to the
    # file-level import block.
    from urllib.parse import quote_plus
    from selenium.webdriver.common.by import By

    # One "Show more results" click is attempted per started batch of 400.
    number_of_scrolls = int(num_requested / 400 + 1)

    # exist_ok avoids the check-then-create race of os.path.exists().
    target_dir = os.path.join(download_path, searchtext.replace(" ", "_"))
    os.makedirs(target_dir, exist_ok=True)

    # Escape the query so spaces, '&', '#', ... cannot corrupt the URL.
    url = ("https://www.google.co.in/search?q=" + quote_plus(searchtext)
           + "&source=lnms&tbm=isch")
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    }
    extensions = {"jpg", "jpeg", "png", "gif"}
    img_count = 0
    downloaded_img_count = 0

    driver = webdriver.Firefox()
    try:
        driver.get(url)
        for _ in range(number_of_scrolls):
            for __ in range(10):
                # multiple scrolls needed to show all 400 images
                driver.execute_script("window.scrollBy(0, 1000000)")
                time.sleep(0.2)
            # to load next 400 images
            time.sleep(0.5)
            try:
                driver.find_element(
                    By.XPATH, "//input[@value='Show more results']").click()
            except Exception as e:
                # No (clickable) button: stop trying to load more results.
                print("Less images found:", e)
                break

        imges = driver.find_elements(By.XPATH, "//div[@class='rg_meta']")
        print("Total images:", len(imges), "\n")
        for img in imges:
            # Checked before downloading so that num_requested == 0 saves
            # nothing and no request is made once the quota is reached.
            if downloaded_img_count >= num_requested:
                break
            img_count += 1
            try:
                # Parse the metadata once; "ou" is used as the image URL and
                # "ity" as the file extension (presumably the image type).
                meta = json.loads(img.get_attribute('innerHTML'))
                img_url = meta["ou"]
                img_type = meta["ity"]
                if img_type not in extensions:
                    img_type = "jpg"
                response1 = http.request('GET', img_url, headers=headers)
                payload = response1.data
                # Only payloads larger than 120000 bytes are written; len()
                # measures the payload itself, unlike sys.getsizeof() which
                # also counts the bytes object's own overhead.
                if len(payload) > 120000:
                    file_name = str(downloaded_img_count) + "." + img_type
                    # "with" guarantees the file is flushed and closed.
                    with open(os.path.join(target_dir, file_name), "wb") as f:
                        f.write(payload)
                    downloaded_img_count += 1
            except Exception as e:
                # Best effort: report the failure and continue with the
                # next result instead of aborting the whole run.
                print("Download failed:", e)
        print("Total downloaded: ", downloaded_img_count, "/", img_count)
    finally:
        # Always close the browser, even if something above raised.
        driver.quit()
if __name__ == "__main__":
    # Command-line entry point: the search text and the maximum number of
    # images are taken from the two positional arguments.
    if len(sys.argv) != 3:
        sys.exit('Usage: python getpic.py "<search text>" <number of images>')
    mai(sys.argv[1], int(sys.argv[2]))