This repository has been archived by the owner on Jan 15, 2025. It is now read-only.

Merge pull request #38 from sgibson91/expose-ext-var
sgibson91 authored Feb 28, 2022
2 parents fece823 + bf9fd6a commit 112fa62
Showing 5 changed files with 34 additions and 13 deletions.
10 changes: 7 additions & 3 deletions README.md
@@ -78,10 +78,13 @@ Each document in the generated database can be described as a dictionary with th
 }
 ```
 
+By default, `deduplify` generates hashes for all files under a directory.
+But the search can be restricted to one or more specific file extensions using the `--exts` flag.
+
 **Command line usage:**
 
 ```bash
-usage: deduplify hash [-h] [-c COUNT] [-v] [-f DBFILE] [--restart] dir
+usage: deduplify hash [-h] [-c COUNT] [-v] [-f DBFILE] [--exts [EXTS]] [--restart] dir
 
 positional arguments:
   dir                   Path to directory to begin search from
@@ -93,8 +96,9 @@ optional arguments:
   -v, --verbose         Print logging messages to the console
   -f DBFILE, --dbfile DBFILE
                         Destination database for file hashes. Must be a JSON file. Default: file_hashes.json
-  --restart             Restart a run of hashing files and skip over files that have already been hashed. Output files containing duplicated and
-                        unique filenames must already exist.
+  --exts [EXTS]         A list of file extensions to search for.
+  --restart             Restart a run of hashing files and skip over files that have already been hashed. Output file containing a database of
+                        filenames and hashes must already exist.
 ```
 
 ### Comparing files
2 changes: 1 addition & 1 deletion deduplify/_version.py
@@ -7,5 +7,5 @@
 
 from incremental import Version
 
-__version__ = Version("deduplify", 0, 3, 0)
+__version__ = Version("deduplify", 0, 4, 0)
 __all__ = ["__version__"]
4 changes: 3 additions & 1 deletion deduplify/cli.py
@@ -82,10 +82,12 @@ def parse_args(args):
         "-f",
         "--dbfile",
         type=resolvepath,
-        dest="dbfile",
         default="file_hashes.json",
         help="Destination database for file hashes. Must be a JSON file. Default: file_hashes.json",
     )
+    parser.add_argument(
+        "--exts", nargs="?", help="A list of file extensions to search for."
+    )
     parser_hash.add_argument(
         "--restart",
         action="store_true",
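Note that `nargs="?"` makes argparse consume at most one value for `--exts`, so the parsed attribute is a single string (or `None` when the flag is absent) rather than a list. A minimal standalone sketch of the flag as merged, reduced from the project's full CLI:

```python
import argparse

# Reduced stand-in for deduplify's parser, showing only the new flag.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--exts", nargs="?", help="A list of file extensions to search for."
)

print(parser.parse_args([]).exts)                 # None
print(parser.parse_args(["--exts", "txt"]).exts)  # txt
```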
27 changes: 20 additions & 7 deletions deduplify/hash_files.py
@@ -23,24 +23,27 @@
 EXPANDED_USER = os.path.expanduser("~")
 
 
-def get_total_number_of_files(target_dir: str, file_ext: str = "*") -> int:
+def get_total_number_of_files(target_dir: str, file_ext: list = ["*"]) -> int:
     """Count the total number of files of a given extension in a directory.
 
     Args:
         target_dir (str): The target directory to search.
-        file_ext (str): The file extension to search for. Default: all extensions.
+        file_ext (list[str]): A list of file extensions to search for. Default: all
+            extensions (['*']).
 
     Returns:
         int: The number of files with the matching extension within the tree
             of the target directory
     """
     logger.info("Calculating number of files that will be hashed in %s" % target_dir)
 
-    output = len(fnmatch.filter(os.listdir(target_dir), f"*.{file_ext}"))
+    num_of_files = 0
+    for ext in file_ext:
+        num_of_files += len(fnmatch.filter(os.listdir(target_dir), f"*.{ext}"))
 
-    logger.info(f"{output} files to be hashed in {target_dir}")
+    logger.info(f"{num_of_files} files to be hashed in {target_dir}")
 
-    return output
+    return num_of_files
 
 
 def hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]:
@@ -118,7 +121,14 @@ def restart_run(db) -> list:
     return [os.path.basename(row["filepath"]) for row in db.all()]
 
 
-def run_hash(dir: str, count: int, dbfile: str, restart: bool = False, **kwargs):
+def run_hash(
+    dir: str,
+    count: int,
+    dbfile: str,
+    restart: bool = False,
+    file_ext: list = ["*"],
+    **kwargs,
+):
     """Hash files within a directory structure
 
     Args:
@@ -128,14 +138,16 @@ def run_hash(dir: str, count: int, dbfile: str, restart: bool = False, **kwargs)
         restart (bool): If true, will restart a hash run. dupfile and unfile
             must exist since the filenames already hashed will be skipped.
             Default: False.
+        file_ext (list[str]): A list of file extensions to search for. Default: all
+            extensions (['*']).
     """
     # Check the directory path exists
     if not os.path.exists(dir):
        raise ValueError("Please provide a known filepath!")
 
     hashes_db = TinyDB(dbfile)
 
-    total_file_num = get_total_number_of_files(dir)
+    total_file_num = get_total_number_of_files(dir, file_ext)
 
     if restart:
         files_to_skip = restart_run(hashes_db)
@@ -154,6 +166,7 @@ def run_hash(dir: str, count: int, dbfile: str, restart: bool = False, **kwargs)
             executor.submit(hashfile, os.path.join(dirName, filename))
             for filename in fileList
             if filename not in files_to_skip
+            if os.path.splitext(filename)[1] in file_ext
         ]
         for future in as_completed(futures):
             hash, filepath = future.result()
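Two standard-library behaviours underpin the new filtering: `fnmatch.filter` matches shell-style patterns such as `*.txt`, and `os.path.splitext` returns the extension with its leading dot, which is what the new guard in the list comprehension compares against `file_ext`. A small illustrative sketch:

```python
import fnmatch
import os.path

# fnmatch.filter keeps the names matching a shell-style pattern, as in
# get_total_number_of_files, which builds an f"*.{ext}" pattern per extension.
names = ["a.txt", "b.xml", "c.txt", "README"]
print(fnmatch.filter(names, "*.txt"))  # ['a.txt', 'c.txt']

# os.path.splitext keeps the leading dot, so the hashing loop's guard
# `os.path.splitext(filename)[1] in file_ext` compares ".txt"-style
# strings against the entries of file_ext.
print(os.path.splitext("a.txt")[1])  # .txt
```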
4 changes: 3 additions & 1 deletion tests/test_hash.py
@@ -15,10 +15,12 @@ def test_get_total_number_of_files():
     dirpath = os.path.join("tests", "testdir")
 
     output1 = get_total_number_of_files(dirpath)
-    output2 = get_total_number_of_files(dirpath, file_ext="txt")
+    output2 = get_total_number_of_files(dirpath, file_ext=["txt"])
+    output3 = get_total_number_of_files(dirpath, file_ext=["txt", "xml"])
 
     assert output1 == 3
     assert output2 == 1
+    assert output3 == 3
 
 
 def test_hashfile():
