This repository has been archived by the owner on Jan 15, 2025. It is now read-only.

Merge pull request #38 from sgibson91/expose-ext-var
sgibson91 authored Feb 28, 2022
2 parents fece823 + bf9fd6a commit 112fa62
Showing 5 changed files with 34 additions and 13 deletions.
10 changes: 7 additions & 3 deletions README.md
@@ -78,10 +78,13 @@ Each document in the generated database can be described as a dictionary with th
 }
 ```
 
+By default, `deduplify` generates hashes for all files under a directory.
+But the search can be restricted to one or more specific file extensions using the `--exts` flag.
+
 **Command line usage:**
 
 ```bash
-usage: deduplify hash [-h] [-c COUNT] [-v] [-f DBFILE] [--restart] dir
+usage: deduplify hash [-h] [-c COUNT] [-v] [-f DBFILE] [--exts [EXTS]] [--restart] dir
 
 positional arguments:
   dir                   Path to directory to begin search from
@@ -93,8 +96,9 @@ optional arguments:
   -v, --verbose         Print logging messages to the console
   -f DBFILE, --dbfile DBFILE
                         Destination database for file hashes. Must be a JSON file. Default: file_hashes.json
-  --restart             Restart a run of hashing files and skip over files that have already been hashed. Output files containing duplicated and
-                        unique filenames must already exist.
+  --exts [EXTS]         A list of file extensions to search for.
+  --restart             Restart a run of hashing files and skip over files that have already been hashed. Output file containing a database of
+                        filenames and hashes must already exist.
 ```
 
 ### Comparing files
2 changes: 1 addition & 1 deletion deduplify/_version.py
@@ -7,5 +7,5 @@
 
 from incremental import Version
 
-__version__ = Version("deduplify", 0, 3, 0)
+__version__ = Version("deduplify", 0, 4, 0)
 __all__ = ["__version__"]
4 changes: 3 additions & 1 deletion deduplify/cli.py
@@ -82,10 +82,12 @@ def parse_args(args):
         "-f",
         "--dbfile",
         type=resolvepath,
-        dest="dbfile",
         default="file_hashes.json",
         help="Destination database for file hashes. Must be a JSON file. Default: file_hashes.json",
     )
+    parser.add_argument(
+        "--exts", nargs="?", help="A list of file extensions to search for."
+    )
     parser_hash.add_argument(
         "--restart",
         action="store_true",
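Note that `nargs="?"` makes argparse consume at most one value for `--exts`, so the parsed attribute is a single string (or `None` when the flag is absent) rather than a list. A minimal standalone sketch of the flag as merged, reduced from the project's full CLI:

```python
import argparse

# Reduced stand-in for deduplify's parser, showing only the new flag.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--exts", nargs="?", help="A list of file extensions to search for."
)

print(parser.parse_args([]).exts)                 # None
print(parser.parse_args(["--exts", "txt"]).exts)  # txt
```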
27 changes: 20 additions & 7 deletions deduplify/hash_files.py
@@ -23,24 +23,27 @@
 EXPANDED_USER = os.path.expanduser("~")
 
 
-def get_total_number_of_files(target_dir: str, file_ext: str = "*") -> int:
+def get_total_number_of_files(target_dir: str, file_ext: list = ["*"]) -> int:
     """Count the total number of files of a given extension in a directory.
 
     Args:
         target_dir (str): The target directory to search.
-        file_ext (str): The file extension to search for. Default: all extensions.
+        file_ext (list[str]): A list of file extensions to search for. Default: all
+            extensions (['*']).
 
     Returns:
         int: The number of files with the matching extension within the tree
             of the target directory
     """
     logger.info("Calculating number of files that will be hashed in %s" % target_dir)
 
-    output = len(fnmatch.filter(os.listdir(target_dir), f"*.{file_ext}"))
+    num_of_files = 0
+    for ext in file_ext:
+        num_of_files += len(fnmatch.filter(os.listdir(target_dir), f"*.{ext}"))
 
-    logger.info(f"{output} files to be hashed in {target_dir}")
+    logger.info(f"{num_of_files} files to be hashed in {target_dir}")
 
-    return output
+    return num_of_files
 
 
 def hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]:
@@ -118,7 +121,14 @@ def restart_run(db) -> list:
     return [os.path.basename(row["filepath"]) for row in db.all()]
 
 
-def run_hash(dir: str, count: int, dbfile: str, restart: bool = False, **kwargs):
+def run_hash(
+    dir: str,
+    count: int,
+    dbfile: str,
+    restart: bool = False,
+    file_ext: list = ["*"],
+    **kwargs,
+):
     """Hash files within a directory structure
 
     Args:
@@ -128,14 +138,16 @@ def run_hash(dir: str, count: int, dbfile: str, restart: bool = False, **kwargs)
         restart (bool): If true, will restart a hash run. dupfile and unfile
             must exist since the filenames already hashed will be skipped.
             Default: False.
+        file_ext (list[str]): A list of file extensions to search for. Default: all
+            extensions (['*']).
     """
     # Check the directory path exists
     if not os.path.exists(dir):
        raise ValueError("Please provide a known filepath!")
 
     hashes_db = TinyDB(dbfile)
 
-    total_file_num = get_total_number_of_files(dir)
+    total_file_num = get_total_number_of_files(dir, file_ext)
 
     if restart:
         files_to_skip = restart_run(hashes_db)
@@ -154,6 +166,7 @@ def run_hash(dir: str, count: int, dbfile: str, restart: bool = False, **kwargs)
             executor.submit(hashfile, os.path.join(dirName, filename))
             for filename in fileList
             if filename not in files_to_skip
+            if os.path.splitext(filename)[1] in file_ext
         ]
         for future in as_completed(futures):
             hash, filepath = future.result()
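Two standard-library behaviours underpin the new filtering: `fnmatch.filter` matches shell-style patterns such as `*.txt`, and `os.path.splitext` returns the extension with its leading dot, which is what the new guard in the list comprehension compares against `file_ext`. A small illustrative sketch:

```python
import fnmatch
import os.path

# fnmatch.filter keeps the names matching a shell-style pattern, as in
# get_total_number_of_files, which builds an f"*.{ext}" pattern per extension.
names = ["a.txt", "b.xml", "c.txt", "README"]
print(fnmatch.filter(names, "*.txt"))  # ['a.txt', 'c.txt']

# os.path.splitext keeps the leading dot, so the hashing loop's guard
# `os.path.splitext(filename)[1] in file_ext` compares ".txt"-style
# strings against the entries of file_ext.
print(os.path.splitext("a.txt")[1])  # .txt
```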
4 changes: 3 additions & 1 deletion tests/test_hash.py
@@ -15,10 +15,12 @@ def test_get_total_number_of_files():
     dirpath = os.path.join("tests", "testdir")
 
     output1 = get_total_number_of_files(dirpath)
-    output2 = get_total_number_of_files(dirpath, file_ext="txt")
+    output2 = get_total_number_of_files(dirpath, file_ext=["txt"])
+    output3 = get_total_number_of_files(dirpath, file_ext=["txt", "xml"])
 
     assert output1 == 3
     assert output2 == 1
+    assert output3 == 3
 
 
 def test_hashfile():
