fix the corrupt image bug

fix the bug where wrong image format is downloaded addresses #81 #183 #189 #213 sphinx doc updates
hardikvasa · May 13, 2019 · cf5ff48 · cf5ff48
1 parent b1bfee2
commit cf5ff48
Show file tree

Hide file tree

Showing 13 changed files with 133 additions and 50 deletions.
diff --git a/.gitignore b/.gitignore
@@ -40,6 +40,8 @@ output/*/index.html
 
 # Sphinx
 docs/_build
+docs/.DS_Store
+docs/_static/*
 
 # Cookiecutter
 output/
@@ -49,3 +51,4 @@ downloads/
 
 # Logs
 logs/
+
diff --git a/docs/_static/.DS_Store b/docs/_static/.DS_Store
diff --git a/docs/_static/overrides.css b/docs/_static/overrides.css
@@ -0,0 +1,14 @@
+table.docutils {
+    margin: 1em 0;
+    padding: 0;
+    border: 1px solid blue;
+    background-color: dotted #33B8FF;
+}
+
+table.docutils td, table.docutils th {
+    padding: 1px 8px 1px 5px;
+    border-top: 0;
+    border-left: 0;
+    border-right: 1px dotted #33B8FF;
+    border-bottom: 1px dotted #33B8FF;
+}
diff --git a/docs/arguments.rst b/docs/arguments.rst
@@ -1,5 +1,6 @@
-Arguments
-=========
+===============
+Input Arguments
+===============
 
 +-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
 | Argument          | Short hand  | Description                                                                                                                   |
@@ -237,4 +238,4 @@ Arguments
 | help              | h           | show the help message regarding the usage of the above arguments                                                              |
 +-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
 
-**Note:** If ``single_image`` or ``url`` parameter is not present, then keywords is a mandatory parameter. No other parameters are mandatory.
+**Note:** If ``single_image`` or ``url`` parameter is not present, then keywords is a mandatory parameter. No other parameters are mandatory.
diff --git a/docs/conf.py b/docs/conf.py
@@ -13,14 +13,20 @@
 # import os
 # import sys
 # sys.path.insert(0, os.path.abspath('.'))
+version = '1.0.1'
 
+source_suffix = '.rst'
+master_doc = 'index'
 
 html_static_path = ['_static']
 
 html_context = {
     'css_files': [
         '_static/overrides.css',  # override wide tables in RTD theme
-        ],
+    ],
+	"display_github": True, # Add 'Edit on Github' link instead of 'View page source'
+	"last_updated": True,
+	"commit": False,
      }
 
 # -- Project information -----------------------------------------------------
@@ -58,3 +64,6 @@
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
 html_static_path = ['_static']
+
+html_sidebars = { '**': ['globaltoc.html', 'relations.html', 'searchbox.html'] }
+
diff --git a/docs/contents.rst b/docs/contents.rst
diff --git a/docs/examples.rst b/docs/examples.rst
@@ -1,3 +1,7 @@
+========
+Examples
+========
+
 Config File Format
 ==================
 
@@ -148,4 +152,18 @@ Command line examples
 Library extensions
 ==================
 
-Coming soon!
+The downloading algorithm does a good job of keeping out corrupt images, but it is not perfect: there is still a chance of getting an occasional corrupt image that cannot be used for processing. The script below helps clean up those corrupt image files. It was proposed by @devajith in `Issue 81 <https://github.com/hardikvasa/google-images-download/issues/81>`__.
+
+.. code:: python
+
+    import os
+    from PIL import Image
+
+    img_dir = r"path/to/downloads/directory"
+    for filename in os.listdir(img_dir):
+        try :
+            with Image.open(img_dir + "/" + filename) as im:
+                 print('ok')
+        except :
+            print(img_dir + "/" + filename)
+            os.remove(img_dir + "/" + filename)
diff --git a/docs/index.rst b/docs/index.rst
@@ -1,5 +1,11 @@
+======================
+Google Images Download
+======================
+
+.. index:: Summary
+
 Summary
--------
+=======
 
 This is a command line python program to search keywords/key-phrases on Google Images
 and optionally download images to your computer. You can also invoke this script from
@@ -11,57 +17,67 @@ images** per keyword, then you would need to install ``Selenium`` library along
 Detailed instructions in the troubleshooting section.
 
 
+.. index:: Compatibility
+
 Compatibility
--------------
+=============
 
 This program is compatible with both the versions of python - 2.x and 3.x (recommended).
 It is a download-and-run program with no changes to the file.
 You will just have to specify parameters through the command line.
 
+.. index:: Installation
 
 Installation
-------------
+============
 
 The guide provides detailed instructions on how to install the library.
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 3
 
    installation
 
+.. index:: Usage
 
 Usage
------
+=====
 
 The following section provides details on using the library - from CLI or by standard imports.
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 3
 
    usage
 
+.. index:: Arguments
+
 Arguments
----------
+=========
 
 This section provides all the arguments/parameters/options you can provide to this library.
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 3
 
    arguments
 
+.. index:: Examples
+
 Examples
---------
+========
 
 Many examples have been provided to help new users quickly ramp up on the usage.
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 3
 
    examples
 
+.. index:: Troubleshooting
+
 Troubleshooting
----------------
+===============
 
 This section provides a troubleshooting guide for commonly seen issues.
 
@@ -70,8 +86,10 @@ This section proviedes troubleshooting guide for commonly seen issues.
 
    troubleshooting
 
+.. index:: Workflow
+
 Workflow
---------
+========
 
 Workflow showcases the algorithm used within this module to download the images.
 
@@ -80,9 +98,10 @@ Workflow showcases the algorithm used within this module to download the images.
 
    structure
 
+.. index:: Contribute
 
 Contribute
-----------
+==========
 
 Anyone is welcomed to contribute to this script.
 If you would like to make a change, open a pull request.
@@ -91,19 +110,22 @@ For issues and discussion visit the
 
 The aim of this repo is to keep it simple, stand-alone, backward compatible and 3rd party dependency proof.
 
+.. index:: Disclaimer
 
 Disclaimer
-----------
-
-This program lets you download tons of images from Google.
-Please do not download or use any image that violates its copyright terms.
-Google Images is a search engine that merely indexes images and allows you to find them.
-It does NOT produce its own images and, as such, it doesn't own copyright on any of them.
-The original creators of the images own the copyrights.
-
-Images published in the United States are automatically copyrighted by their owners,
-even if they do not explicitly carry a copyright warning.
-You may not reproduce copyright images without their owner's permission,
-except in "fair use" cases,
-or you could risk running into lawyer's warnings, cease-and-desist letters, and copyright suits.
-Please be very careful before its usage!
+==========
+
+.. warning::
+
+   This program lets you download tons of images from Google.
+   Please do not download or use any image that violates its copyright terms.
+   Google Images is a search engine that merely indexes images and allows you to find them.
+   It does NOT produce its own images and, as such, it doesn't own copyright on any of them.
+   The original creators of the images own the copyrights.
+
+   Images published in the United States are automatically copyrighted by their owners,
+   even if they do not explicitly carry a copyright warning.
+   You may not reproduce copyright images without their owner's permission,
+   except in "fair use" cases,
+   or you could risk running into lawyer's warnings, cease-and-desist letters, and copyright suits.
+   Please be very careful before its usage!
diff --git a/docs/installation.rst b/docs/installation.rst
@@ -1,5 +1,4 @@
-.. _installation:
-
+============
 Installation
 ============
 

diff --git a/docs/structure.rst b/docs/structure.rst
@@ -1,5 +1,6 @@
-Structure
-=========
+========
+Workflow
+========
 
 Below diagram represents the algorithm logic to download images.
 

diff --git a/docs/troubleshooting.rst b/docs/troubleshooting.rst
@@ -1,15 +1,16 @@
+=============================
 Troubleshooting Errors/Issues
 =============================
 
 SSL Errors
-----------
+==========
 
 If you do see SSL errors on Mac for Python 3,
 please go to Finder —> Applications —> Python 3 —> Click on the ‘Install Certificates.command’
 and run the file.
 
 googleimagesdownload: command not found
----------------------------------------
+=======================================
 
 While using the above commands, if you get ``Error: -bash: googleimagesdownload: command not found`` then you have to set the correct path variable.
 
@@ -35,13 +36,13 @@ together they make: ``/Library/Frameworks/Python.framework/Versions/2.7/bin`` wh
 
 
 [Errno 13] Permission denied creating directory 'downloads'
------------------------------------------------------------
+===========================================================
 
 When you run the command, it downloads the images in the current directory (the directory from where you are running the command). If you get permission denied error for creating the `downloads directory`, then move to a directory in which you have the write permission and then run the command again.
 
 
 Permission denied while installing the library
-----------------------------------------------
+==============================================
 
 On MAC and Linux, when you get permission denied when installing the library using pip, try doing a user install.
 
@@ -53,7 +54,7 @@ You can also run pip install as a superuser with ``sudo pip install google_image
 
 
 Installing the chromedriver (with Selenium)
--------------------------------------------
+===========================================
 
 If you would want to download more than 100 images per keyword, then you will need to install 'selenium' library along with 'chromedriver' extension.
 

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -1,3 +1,7 @@
+=====
+Usage
+=====
+
 Using the library from Command Line Interface
 =============================================
 

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
@@ -574,7 +574,7 @@ def download_image_thumbnail(self,image_url,main_directory,dir_name,return_image
 
 
     # Download Images
-    def download_image(self,image_url,image_format,main_directory,dir_name,count,print_urls,socket_timeout,prefix,print_size,no_numbering,no_download,save_source,img_src,silent_mode,thumbnail_only):
+    def download_image(self,image_url,image_format,main_directory,dir_name,count,print_urls,socket_timeout,prefix,print_size,no_numbering,no_download,save_source,img_src,silent_mode,thumbnail_only,format):
         if thumbnail_only:
             return "success", "Skipping image download...", str(image_url[(image_url.rfind('/')) + 1:]), image_url
         if not silent_mode:
@@ -596,16 +596,27 @@ def download_image(self,image_url,image_format,main_directory,dir_name,count,pri
                 data = response.read()
                 response.close()
 
+                extensions = [".jpg", ".jpeg", ".gif", ".png", ".bmp", ".svg", ".webp", ".ico"]
                 # keep everything after the last '/'
                 image_name = str(image_url[(image_url.rfind('/')) + 1:])
-                # if no extension then add it
-                # remove everything after the image name
-                if image_format == "":
-                    image_name = image_name + "." + "jpg"
-                elif image_format == "jpeg":
-                    image_name = image_name[:image_name.find(image_format) + 4]
+                if format:
+                    if not image_format or image_format != format:
+                        download_status = 'fail'
+                        download_message = "Wrong image format returned. Skipping..."
+                        return_image_name = ''
+                        absolute_path = ''
+                        return download_status, download_message, return_image_name, absolute_path
+
+                if image_format == "" or not image_format or "." + image_format not in extensions:
+                    download_status = 'fail'
+                    download_message = "Invalid or missing image format. Skipping..."
+                    return_image_name = ''
+                    absolute_path = ''
+                    return download_status, download_message, return_image_name, absolute_path
+                elif image_name.lower().find("." + image_format) < 0:
+                    image_name = image_name + "." + image_format
                 else:
-                    image_name = image_name[:image_name.find(image_format) + 3]
+                    image_name = image_name[:image_name.lower().find("." + image_format) + (len(image_format) + 1)]
 
                 # prefix name in image
                 if prefix:
@@ -748,7 +759,7 @@ def _get_all_items(self,page,main_directory,dir_name,limit,arguments):
                         print("\nImage Metadata: " + str(object))
 
                 #download the images
-                download_status,download_message,return_image_name,absolute_path = self.download_image(object['image_link'],object['image_format'],main_directory,dir_name,count,arguments['print_urls'],arguments['socket_timeout'],arguments['prefix'],arguments['print_size'],arguments['no_numbering'],arguments['no_download'],arguments['save_source'],object['image_source'],arguments["silent_mode"],arguments["thumbnail_only"])
+                download_status,download_message,return_image_name,absolute_path = self.download_image(object['image_link'],object['image_format'],main_directory,dir_name,count,arguments['print_urls'],arguments['socket_timeout'],arguments['prefix'],arguments['print_size'],arguments['no_numbering'],arguments['no_download'],arguments['save_source'],object['image_source'],arguments["silent_mode"],arguments["thumbnail_only"],arguments['format'])
                 if not arguments["silent_mode"]:
                     print(download_message)
                 if download_status == "success":