diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..38482cc --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,41 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.12"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + sudo apt-get install -y python3 python3-pip python3-setuptools gnupg curl file + curl -fsSL https://pgp.mongodb.com/server-7.0.asc | sudo gpg -o /usr/share/keyrings/mongodb-server-7.0.gpg --dearmor + echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-7.0.gpg ] https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-7.0.list + sudo apt-get update && sudo apt-get install -y mongodb-org && sudo mkdir -p /data/db && mongod & + pip install --upgrade setuptools + pip install --only-binary=numpy,scipy -r requirements.txt + pip install -r requirements-test.txt + - name: pep8 styles + run: | + pycodestyle *.py hashers tests + - name: Test with pytest + run: | + pytest tests/test.py -v diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..f937284 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,3 @@ +[MASTER] + +extension-pkg-allow-list=av \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 998077d..0000000 --- a/.travis.yml +++ /dev/null @@ -1,12 +0,0 @@ -language: python -dist: focal -python: - - "3.7" - - "3.8" -install: - - sudo apt-get install -y libblas-dev liblapack-dev libatlas-base-dev gfortran libffi-dev automake make libtool autoconf g++ m4 pkg-config - - wget https://github.com/strukturag/libheif/releases/download/v1.10.0/libheif-1.10.0.tar.gz - - tar xvf libheif-1.10.0.tar.gz && pushd libheif-1.10.0 && ./autogen.sh && ./configure --disable-dependency-tracking && make && sudo make install && popd && rm -rf libheif-1.10.0 - - pip install --only-binary=numpy,scipy -r requirements.txt - - pip install -r requirements-test.txt -script: pytest tests/test.py -v diff --git a/README.md b/README.md index 92faee9..00b3c52 100644 --- a/README.md +++ b/README.md @@ -2,26 +2,26 @@ ![](https://api.travis-ci.org/philipbl/duplicate-images.svg) -This Python script finds duplicate images using a [perspective hash (pHash)](http://www.phash.org) to compare images. pHash ignores the image size and file size and instead creates a hash based on the pixels of the image. This allows you to find duplicate pictures that have been rotated, have changed metadata, and slightly edited. +This Python script finds duplicate files: +- any file by exact match of blake2b hash +- images using a [perspective hash (pHash)](http://www.phash.org) to compare images. pHash ignores the image size and file size and instead creates a hash based on the pixels of the image. This allows you to find duplicate pictures that have been rotated, have changed metadata, and slightly edited. 
+- videos by extracting N frames at respective times and hashing them with perspective hash (see above) -This script hashes images added to it, storing the hash into a database (MongoDB). To find duplicate images, hashes are compared. If the hash is the same between two images, then they are marked as duplicates. A web interface is provided to delete duplicate images easily. If you are feeling lucky, there is an option to automatically delete duplicate files. +This script hashes files added to it, storing the hashes into a database (MongoDB). To find duplicate files, hashes are compared. If the hash is the same between two files, then they are marked as duplicates. A web interface is provided to delete duplicate files easily. If you are feeling lucky, there is an option to automatically delete duplicate files. -As a word of caution, pHash is not perfect. I have found that duplicate pictures sometimes have different hashes and similar (but not the same) pictures have the same hash. This script is a great starting point for cleaning your photo library of duplicate pictures, but make sure you look at the pictures before you delete them. You have been warned! I hold no responsibility for any family memories that might be lost because of this script. - -This script has only been tested with Python 3 and is still pretty rough around the edges. Use at your own risk. +As a word of caution, pHash is not perfect. I have found that duplicate pictures sometimes have different hashes and similar (but not the same) pictures have the same hash. This script is a great starting point for cleaning your photo or video library of duplicate pictures, but make sure you look at the pictures before you delete them. You have been warned! I hold no responsibility for any family memories that might be lost because of this script. ## Requirements -This script requires MongoDB, Python 3.4 or higher, and a few Python modules, as found in `requirements.txt`. - +This script requires MongoDB, Python 3.12 or higher, and a few Python modules, as found in `requirements.txt`. ## Quick Start I suggest you read the usage, but here are the steps to get started right away. These steps assume that MongoDB is already installed on the system. -First, install this script. This can be done by either cloning the repository or [downloading the script](https://github.com/philipbl/duplicate-images/archive/master.zip). +First, install this script. This can be done by either cloning the repository or [downloading the script](https://github.com/bolshevik/duplicate-images/archive/master.zip). ```bash -git clone https://github.com/philipbl/duplicate-images.git +git clone https://github.com/bolshevik/duplicate-images.git ``` Next, download all required modules. This script has only been tested with Python 3. 
I would suggest that you make a virtual environment, setting Python 3 as the default python executable (`mkvirtualenv --python=/usr/local/bin/python3 `) @@ -34,26 +34,6 @@ Last, run script: python duplicate_finder.py ``` -## On Ubuntu 18.04 - -```bash -# Install Mongo and pip -sudo apt -y install mongodb-server python3-pip -# Disable Mongo service autostart -sudo systemctl disable mongodb.service -# Stop Mongo service -sudo service mongodb stop -``` - -Python 2 is the default version of Python, so we have to call `python3` explicitely: - -```bash -# Install dependencies with Python 3 -pip3 install -r requirements.txt -# “python duplicate_finder.py” will fail, so we have to use Python 3 for every call: -python3 duplicate_finder.py … -``` - ## Example ```bash @@ -76,6 +56,7 @@ Usage: duplicate_finder.py remove ... [--db=] duplicate_finder.py clear [--db=] duplicate_finder.py show [--db=] + duplicate_finder.py cleanup [--db=] duplicate_finder.py find [--print] [--delete] [--match-time] [--trash=] [--db=] [--threshold=] duplicate_finder.py -h | --help @@ -84,13 +65,13 @@ Options: --db= The location of the database or a MongoDB URI. (default: ./db) - --parallel= The number of parallel processes to run to hash the image - files (default: number of CPUs). + --parallel= The number of parallel processes to run to hash the files + (default: number of CPUs). find: - --threshold= Image matching threshold. Number of different bits in Hamming distance. False positives are possible. + --threshold= Hash matching threshold. Number of different bits in Hamming distance. False positives are possible. --print Only print duplicate files rather than displaying HTML file - --delete Move all found duplicate pictures to the trash. This option takes priority over --print. + --delete Move all found duplicate files to the trash. This option takes priority over --print. --match-time Adds the extra constraint that duplicate images must have the same capture times in order to be considered. --trash= Where files will be put when they are deleted (default: ./Trash) @@ -101,14 +82,14 @@ Options: python duplicate_finder.py add /path/to/images ``` -When a path is added, image files are recursively searched for. In particular, `JPEG`, `PNG`, `GIF`, and `TIFF` images are searched for. Any image files found will be hashed. Adding a path uses 8 processes (by default) to hash images in parallel so the CPU usage is very high. +When a path is added, files are recursively searched for. Binary content hash is applied to all files, for image files like `JPEG`, `PNG`, `GIF`, and `TIFF` the perceptive hash is applied. Video hash is applied to video files. Adding a path uses 8 processes (by default) to hash images in parallel so the CPU usage is very high. ### Remove ```bash python duplicate_finder.py remove /path/to/images ``` -A path can be removed from the database. Any image inside that path will be removed from the database. +A path can be removed from the database. Any file inside that path will be removed from the database. ### Clear ```bash @@ -117,6 +98,13 @@ python duplicate_finder.py clear Removes all hashes from the database. +### Cleanup +```bash +python duplicate_finder.py cleanup +``` + +Clean disappeared files from the database. + ### Show ```bash python duplicate_finder.py show @@ -129,7 +117,25 @@ Prints the contents database. duplicate_finder.py find [--print] [--delete] [--match-time] [--trash=] [--threshold=] ``` -Finds duplicate pictures that have been hashed. 
This will find images that have the same hash stored in the database. There are a few options associated with `find`. By default, when this command is run, a webpage is displayed showing duplicate pictures and a server is started that allows for the pictures to be deleted (images are not actually deleted, but moved to a trash folder -- I really don't want you to make a mistake). The first option, **`--print`**, prints all duplicate pictures and does not display a webpage or start the server. **`--delete`** automatically moves all duplicate images found to the trash. Be careful with this one. **`--match-time`** adds the extra constraint that images must have the same EXIF time stamp to be considered duplicate pictures. Last, `--trash=` lets you select a path to where you want files to be put when they are deleted. The default trash location is `./Trash`. +Finds duplicate files that have been hashed. This will find files that have the same hash stored in the database. There are a few options associated with `find`. By default, when this command is run, a webpage is displayed showing duplicate files and a server is started that allows for the files to be deleted (files are not actually deleted, but moved to a trash folder -- I really don't want you to make a mistake). The first option, **`--print`**, prints all duplicate files and does not display a webpage or start the server. **`--delete`** automatically moves all duplicate files found to the trash. Be careful with this one. **`--match-time`** adds the extra constraint that images must have the same EXIF time stamp to be considered duplicate pictures. `--trash=` lets you select a path to where you want files to be put when they are deleted. The default trash location is `./Trash`. Last, `--threshold=` specifies number of bits of Hamming distance to run fuzzy matching of hashes. + +# Testing + +## Ubuntu 22.04 +``` +sudo apt-get install python3 python3-pip python3-setuptools gnupg curl file +curl -fsSL https://pgp.mongodb.com/server-7.0.asc | \ + sudo gpg -o /usr/share/keyrings/mongodb-server-7.0.gpg \ + --dearmor +echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-7.0.gpg ] https://repo.mongodb.org/apt/ubuntu jammy/mongodb-org/7.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-7.0.list +sudo apt-get update +sudo apt-get install -y mongodb-org +sudo mkdir /data/db +sudo mongod + +pip install --only-binary=numpy,scipy -r requirements.txt +pip install -r requirements-test.txt +``` ## Disclaimer diff --git a/duplicate_finder.py b/duplicate_finder.py index 188ddf8..011ff3a 100755 --- a/duplicate_finder.py +++ b/duplicate_finder.py @@ -17,14 +17,14 @@ --db= The location of the database or a MongoDB URI. (default: ./db) - --parallel= The number of parallel processes to run to hash the image - files (default: number of CPUs). + --parallel= The number of parallel processes to run to hash the files + (default: number of CPUs). find: - --threshold= Image matching threshold. Number of different bits in Hamming \ + --threshold= Hash matching threshold. Number of different bits in Hamming \ distance. False positives are possible. --print Only print duplicate files rather than displaying HTML file - --delete Move all found duplicate pictures to the trash. This option \ + --delete Move all found duplicate files to the trash. This option \ takes priority over --print. --match-time Adds the extra constraint that duplicate images must have the same capture times in order to be considered. 
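The `--threshold=` option above is measured in bits of Hamming distance between hashes. Purely as a point of reference (this snippet is not part of the patch, and the file names are hypothetical), the comparison it describes looks roughly like this for perceptual image hashes, using the ImageHash and Pillow packages already pinned in `requirements.txt`:

```python
# Illustration only: compare two images by perceptual hash.
import imagehash
from PIL import Image

hash_a = imagehash.phash(Image.open('a.jpg'))  # hypothetical files
hash_b = imagehash.phash(Image.open('b.jpg'))

# Subtracting two ImageHash objects gives the Hamming distance:
# the number of bits that differ between the two hashes.
distance = hash_a - hash_b

# With --threshold=10, files whose hashes differ by at most 10 bits
# are treated as matches, which is why false positives are possible.
if distance <= 10:
    print(f'likely duplicates ({distance} bits apart)')
```

A distance of 0 is an exact hash match; raising the threshold widens the net at the cost of more false positives.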
@@ -55,7 +55,7 @@ import pybktree from termcolor import cprint -from hashers import BinaryHasher, ImageHasher +from hashers import BinaryHasher, ImageHasher, VideoHasher # , VideoBarcodeHasher def get_hashers() -> list: @@ -63,6 +63,8 @@ def get_hashers() -> list: return [ BinaryHasher(), ImageHasher(), + VideoHasher(ImageHasher()), + # VideoBarcodeHasher(ImageHasher()), ] @@ -73,7 +75,8 @@ def connect_to_db(db_conn_string='./db'): # Determine db_conn_string is a mongo URI or a path # If this is a URI - if 'mongodb://' == db_conn_string[:10] or 'mongodb+srv://' == db_conn_string[:14]: + if 'mongodb://' == db_conn_string[:10] \ + or 'mongodb+srv://' == db_conn_string[:14]: client = pymongo.MongoClient(db_conn_string) cprint("Connected server...", "yellow") db = client.image_database @@ -117,6 +120,9 @@ def get_files(path: str): :param path: :return: yield absolute path """ + if os.path.isfile(path): + yield path + path = os.path.abspath(path) for root, _, files in os.walk(path): for file in files: @@ -242,7 +248,7 @@ def same_time(dup): """Check if capture_time meta attribute is the same""" items = dup['items'] - if len({(i['meta']['capture_time'] if 'capture_time' in i['meta'] else '') for i in items})>1: + if len({(i['meta']['capture_time'] if 'capture_time' in i['meta'] else '') for i in items}) > 1: return False return True @@ -276,7 +282,21 @@ def find(db, match_time=False): if match_time: dups = (d for d in dups if same_time(d)) - return list(dups) + return make_duplcated_groups_unique(dups) + + +def make_duplcated_groups_unique(dups): + """Deduplicate results by removing same groups of duplicates matched by multiple hashes""" + cprint('Removing same groups of duplicates having multiple matching hashes...') + deduplicated = [] + unique_groups = set() + for dup in dups: + group_ids = ' '.join(sorted(x['file_name'] for x in dup['items'])) + if group_ids not in unique_groups: + unique_groups.add(group_ids) + deduplicated.append(dup) + + return deduplicated def find_threshold(db, threshold=1): @@ -297,40 +317,50 @@ def find_threshold(db, threshold=1): scanned = 0 for document in db.find(): - cprint(f'\r{(scanned * 100 / (cnt - 1))}%', end='') + cprint(f'\r{round(scanned * 100 / (cnt - 1))}%', end='') scanned = scanned + 1 max_size = document['meta']['file_size'] + similar = [] + similar_hashes = [] for doc_hash in document['hashes']: if doc_hash in deduplicated: continue deduplicated.add(doc_hash) int_hash = int.from_bytes(doc_hash, "big") + new_similar = tree.find(int_hash, threshold) + if len(new_similar) > 1: + similar_hashes.append(str(doc_hash)) + similar = similar + new_similar + + if len(similar) > 1: + similar = list(set(similar)) + + similars = [] + similars_name = set() + for (distance, item_hash) in similar: + if distance > 0: + deduplicated.add(item_hash) + + for item in db.find({'hashes': binascii.unhexlify(hex(item_hash)[2:])}): + if item['_id'] in similars_name: + continue + similars_name.add(item['_id']) + item['file_name'] = item['_id'] + similars.append(item) + max_size = max_size if item['meta']['file_size'] <= max_size \ + else item['meta']['file_size'] + + if len(similars) > 1: + dups.append( + { + '_id': similar_hashes, + 'total': len(similars), + 'items': similars, + 'file_size': max_size + } + ) - similar = tree.find(int_hash, threshold) - if len(similar) > 1: - similar = list(set(similar)) - - similars = [] - for (distance, item_hash) in similar: - if distance > 0: - deduplicated.add(item_hash) - - for item in db.find({'hashes': 
binascii.unhexlify(hex(item_hash)[2:])}): - item['file_name'] = item['_id'] - similars.append(item) - max_size = max_size if item['meta']['file_size'] <= max_size \ - else item['meta']['file_size'] - if len(similars) > 0: - dups.append( - { - '_id': doc_hash, - 'total': len(similars), - 'items': similars, - 'file_size': max_size - } - ) - - return dups + return make_duplcated_groups_unique(dups) def delete_duplicates(duplicates, db): @@ -388,19 +418,29 @@ def display_duplicates(duplicates, db, trash="./Trash/"): def render(duplicates, current, total): env = Environment(loader=FileSystemLoader('template')) + env.filters['hash'] = hash template = env.get_template('index.html') return template.render(duplicates=duplicates, current=current, total=total) with TemporaryDirectory() as folder: - # Generate all of the HTML files - chunk_size = 25 - for i, dups_page in enumerate(chunked(duplicates, chunk_size)): - with open(f'{folder}/{i}.html', 'w', encoding="utf-8") as f: - f.write(render(dups_page, - current=i, - total=math.ceil(len(duplicates) / chunk_size))) + if len(duplicates) == 0: + env = Environment(loader=FileSystemLoader('template')) + template = env.get_template('no_duplicates.html') + with open(f'{folder}/0.html', 'w', encoding="utf-8") as f: + f.write(template.render()) + + else: + # Generate all of the HTML files + chunk_size = 25 + for i, dups_page in enumerate(chunked(duplicates, chunk_size)): + with open(f'{folder}/{i}.html', 'w', encoding="utf-8") as f: + f.write(render(dups_page, + current=i, + total=math.ceil(len(duplicates) / chunk_size) + ) + ) webbrowser.open(f'file://{folder}/0.html') diff --git a/hashers/__init__.py b/hashers/__init__.py index 400ea99..5875c6a 100644 --- a/hashers/__init__.py +++ b/hashers/__init__.py @@ -6,3 +6,4 @@ from .binaryhasher import BinaryHasher from .imagehasher import ImageHasher from .videohasher import VideoHasher +from .videobarcodehasher import VideoBarcodeHasher diff --git a/hashers/abstracthasher.py b/hashers/abstracthasher.py index 78739dc..7204bf1 100644 --- a/hashers/abstracthasher.py +++ b/hashers/abstracthasher.py @@ -6,15 +6,17 @@ class AbstractHasher(ABC): """Interface class of all hashers, implements basic functions""" - def is_applicable(self, file_name: str) -> bool: - """Return if the hasher is applicable, must be overwritten in the child.""" + def is_applicable(self, _: str) -> bool: + """Return if the hasher is applicable, must be overwritten in the + child.""" return False @abstractmethod def hash(self, file_object) -> tuple: """Hash file""" - def _is_matching_magic(self, file_name: str, supported_magics: list) -> bool: + def _is_matching_magic(self, file_name: str, supported_magics: list) \ + -> bool: try: mime = magic.from_file(file_name, mime=True) return mime.rsplit('/', 1)[1] in supported_magics diff --git a/hashers/binaryhasher.py b/hashers/binaryhasher.py index 3915c62..384fd12 100644 --- a/hashers/binaryhasher.py +++ b/hashers/binaryhasher.py @@ -10,7 +10,7 @@ class BinaryHasher(abstracthasher.AbstractHasher): def __init__(self, digest_size=None): self._digest_size = digest_size if digest_size is not None else 16 - def is_applicable(self, file_name: str) -> bool: + def is_applicable(self, _: str) -> bool: return True def hash(self, file_object) -> tuple: diff --git a/hashers/imagehasher.py b/hashers/imagehasher.py index f9b6073..766d1ba 100644 --- a/hashers/imagehasher.py +++ b/hashers/imagehasher.py @@ -25,11 +25,14 @@ def _get_capture_time(self, img): if k in ExifTags.TAGS } return exif["DateTimeOriginal"] - 
except: + except Exception: return "Time unknown" def hash(self, file_object) -> tuple: - img = Image.open(file_object) + if isinstance(file_object, Image.Image): + img = file_object + else: + img = Image.open(file_object) hashes = [] # hash the image 4 times and rotate it by 90 degrees each time diff --git a/hashers/videobarcodehasher.py b/hashers/videobarcodehasher.py new file mode 100644 index 0000000..cdb8fd8 --- /dev/null +++ b/hashers/videobarcodehasher.py @@ -0,0 +1,46 @@ +"""Video hasher""" +import datetime +import av +from PIL import Image +import numpy as np + +from . import videohasher, abstracthasher + + +class VideoBarcodeHasher(videohasher.VideoHasher): + """Video Barcode hasher + https://pyav.org/docs/develop/cookbook/numpy.html#video-barcode""" + + def __init__(self, image_hasher: abstracthasher.AbstractHasher): + super(VideoBarcodeHasher).__init__(self, image_hasher) + + def hash(self, file_object) -> tuple: + hashes = [] + duration = 0 + frames = 0 + columns = [] + with av.open(file_object) as video_decoder: + stream = video_decoder.streams.video[0] + stream.codec_context.skip_frame = "NONKEY" + + for frame in video_decoder.decode(video=0): + array = frame.to_ndarray(format="rgb24") + # Collapse down to a column. + column = array.mean(axis=1) + # Convert to bytes, as the `mean` turned our array into floats. + column = column.clip(0, 255).astype("uint8") + # Get us in the right shape for the `hstack` below. + column = column.reshape(-1, 1, 3) + columns.append(column) + + if stream.duration is not None: + duration = float(stream.duration * stream.time_base) + frames = stream.frames + + full_array = np.hstack(columns) + full_img = Image.fromarray(full_array, "RGB") + ih = self._image_hasher.hash(full_img) + hashes = hashes + ih[0] + + hashes = list(set(hashes)) # make hashes unique + return self._format_hashes(hashes, frames, str(datetime.timedelta(seconds=duration))) diff --git a/hashers/videohasher.py b/hashers/videohasher.py index c978d26..0fa8ece 100644 --- a/hashers/videohasher.py +++ b/hashers/videohasher.py @@ -1,21 +1,58 @@ """Video hasher""" +import datetime +import av from . 
import abstracthasher class VideoHasher(abstracthasher.AbstractHasher): """Video hasher""" - def __init__(self, image_hasher: abstracthasher.AbstractHasher): - self._image_hesher = image_hasher + def __init__(self, image_hasher: abstracthasher.AbstractHasher, + frames_number=5): + self._image_hasher = image_hasher + self._frames_number = frames_number def is_applicable(self, file_name: str) -> bool: return self._is_matching_magic( file_name, - [] + ['x-matroska', 'MP2T', 'mp4', 'ogg', 'x-msvideo', + 'webm', 'quicktime'] ) def hash(self, file_object) -> tuple: + hashes = [] + duration = 0 + frames = 0 + with av.open(file_object, mode='r') as video_decoder: + stream = video_decoder.streams.video[0] + stream.codec_context.skip_frame = "NONKEY" + video_duration = video_decoder.duration + fraction = video_duration // self._frames_number + for i in range(self._frames_number): + seek_position = i * fraction + (fraction // 2) + # seek to position but shifted half of the fraction back + try: + video_decoder.seek(seek_position) + for frame in video_decoder.decode(): + if isinstance(frame, av.video.frame.VideoFrame): + ih = self._image_hasher.hash(frame.to_image()) + hashes = hashes + ih[0] + break + except av.error.PermissionError: + pass + + if stream.duration is not None: + duration = float(stream.duration * stream.time_base) + frames = stream.frames + + hashes = list(set(hashes)) # make hashes unique + return self._format_hashes(hashes, frames, str(datetime.timedelta(seconds=duration))) + + def _format_hashes(self, hashes, frames, duration): return ( - [], - {} + hashes, + { + 'total_frames': frames, + 'duration': duration, + } ) diff --git a/requirements-test.txt b/requirements-test.txt index e410cd3..9b83797 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,4 +1,4 @@ -mongomock==3.22.0 -pyfakefs==4.3.3 -pytest==6.2.1 -pytest-cov==2.10.1 +mongomock==4.1.2 +pytest==7.4.3 +pytest-cov==4.1.0 +pycodestyle==2.11.1 diff --git a/requirements.txt b/requirements.txt index d595c21..3ee954b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,10 +4,11 @@ ImageHash==4.3.1 Jinja2==3.1.2 more-itertools==10.1.0 Pillow==10.1.0 -pymongo==4.6.0 +pymongo==4.6.1 python-magic==0.4.27 -termcolor==2.3.0 +termcolor==2.4.0 Werkzeug==3.0.1 Flask-Cors==4.0.0 pybktree==1.1 pillow-heif==0.13.1 +av==11.0.0 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..bf1ec54 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,4 @@ +[pycodestyle] +count = False +max-line-length = 100 +statistics = True diff --git a/template/index.html b/template/index.html index a0e1ff3..4b6f72d 100644 --- a/template/index.html +++ b/template/index.html @@ -7,52 +7,66 @@ -{% macro image2(dup) -%} +{% macro duplicate_group(dup) -%} - - - + + {% for key, values in dup['items'].items() %} - {% if key == 'id' %} - {% elif key == 'file_name' %} - {% else %} + {% if key not in ('id', 'file_name') %} {% for value in values %} - + {% endfor %} {% endif %} {% endfor %} + + + {% for file_name in dup['items']['file_name'] %} + + {% endfor %} + {% for file_name in dup['items']['file_name'] %} - + {% endfor %} - + {% for id in dup['items']['id'] %} - + {% endfor %}
+
Hash{{ dup['_id'] }}{{ dup['_id'] }}
Max size {{ dup['file_size'] }}
{{ key | replace('_', ' ') | title }}{{ value }}{{ value }}
Filename + {{ file_name }} +
Preview + {% if '.heic' in file_name | lower %} {{ file_name }} {% else %} - - {{ file_name }} - + {% if '.mp4' in file_name | lower or '.avi' in file_name | lower or '.mov' in file_name | lower + or '.ogv' in file_name | lower or '.mkv' in file_name | lower + or '.webm' in file_name | lower or '.ts' in file_name | lower %} + + {% else %} + + {{ file_name }} + + {% endif %} {% endif %} -
+ -
@@ -83,7 +97,7 @@
{% for dup in duplicates %}
- {{ image2(dup) }} + {{ duplicate_group(dup) }}
{% endfor %} @@ -104,8 +118,9 @@ $(".delete-btn").click(function() { var file_name = $(this).data("name"); - var parent = $(this).parent(); - var column = $(this).parent().parent().prev().find('td').eq($(this).index()-1) + var parent = $(this).parent(); + var item_id = $(this).parent().data("item-id"); + var all_tds = $("td").filter("[data-item-id='" + item_id + "']") $.ajax({ url: 'http://127.0.0.1:5000/picture' + file_name, //encodeURIComponent(file_name), @@ -113,8 +128,7 @@ type: 'DELETE', success: function(data) { if(data == "True") { - parent.addClass('fade'); - column.addClass('fade'); + all_tds.addClass('fade'); } else { $('#file-not-found-alert').addClass('in'); diff --git a/template/no_duplicates.html b/template/no_duplicates.html new file mode 100644 index 0000000..67a9fdf --- /dev/null +++ b/template/no_duplicates.html @@ -0,0 +1,11 @@ + + + + + + +
+

No Duplicates Found!

+
+ + \ No newline at end of file diff --git a/tests/test.py b/tests/test.py index 1fc6528..6b58834 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,50 +1,63 @@ +"""Autotest""" import os import shutil import mongomock -import pyfakefs.fake_filesystem as fake_fs -import pytest import duplicate_finder def test_get_image_files(): - images = ['tests/images/u.jpg', 'tests/images/u.heic', 'tests/images/file.png', 'tests/images/file.gif', 'tests/images/file.tiff', - 'tests/images/image.txt', 'tests/images/deeply/nested/different.jpg', - 'tests/images/deeply/nested/image/sideways.jpg', 'tests/images/deeply/nested/image/smaller.jpg'] - other = ['tests/images/not_image.txt', 'tests/images/not_image.jpb', 'README.md'] + """test_get_image_files""" + images = ['u.jpg', 'u.heic', 'file.png', 'file.gif', 'file.tiff', + 'image.txt', 'deeply/nested/different.jpg', + 'deeply/nested/image/sideways.jpg', 'deeply/nested/image/smaller.jpg', + 'not_image.jpg', 'not_image.txt'] - assert sorted([str(x).rsplit('/', 1)[1] for x in duplicate_finder.get_image_files('.')]) == \ - sorted([str(x).rsplit('/', 1)[1] for x in images]) + assert sorted([ + str(x).rsplit('/tests/images/', 1)[1] + for x in duplicate_finder.get_files('tests/images/') + ]) == sorted([str(x) for x in images]) def test_hash_file(): + """test_hash_file""" image_name = 'tests/images/u.jpg' result = duplicate_finder.hash_file(image_name) assert result is not None - file, hash_, file_size, image_size, capture_time = result + file, (hashes, _) = result assert file == image_name - assert hash_ == '87d35b107818e5d7963a5d2869d4b4b6c3950a873c7ee11ed2790eba2da2b03d' + assert hashes == [b'bin:b\xcfq\xb4*\x15\x9eJ\xf4n\xd9M\x83zG\x05', + b'img:\xc3\x95\n\x87<~\xe1\x1e', b'img:\xd2y\x0e\xba-\xa2\xb0=', + b'img:\x96:](i\xd4\xb4\xb6', b'img:\x87\xd3[\x10x\x18\xe5\xd7'] result = duplicate_finder.hash_file('tests/images/nothing.png') assert result is None - result = duplicate_finder.hash_file('tests/images/not_image.txt') - assert result is None + file, (hashes, _) = duplicate_finder.hash_file('tests/images/not_image.txt') + assert hashes == [b'bin:\x9b-z\xdf\x0e\xde\x89\x8c\x062\xec\x87\xf6g\x01\xe8'] def test_hash_file_rotated(): + """test_hash_file_rotated""" image_name_1 = 'tests/images/u.jpg' image_name_2 = 'tests/images/deeply/nested/image/sideways.jpg' result_1 = duplicate_finder.hash_file(image_name_1) result_2 = duplicate_finder.hash_file(image_name_2) + hashes_1 = result_1[1][0] + hashes_2 = result_2[1][0] + + # filter only images hashes + hashes_1 = list(filter(lambda hash: b'img:' in hash, hashes_1)) + hashes_2 = list(filter(lambda hash: b'img:' in hash, hashes_2)) - assert result_1[1] == result_2[1] + assert hashes_1.sort() == hashes_2.sort() def test_hash_files_parallel(): + """test_hash_files_parallel""" files = ['tests/images/u.jpg', 'tests/images/nothing.png', 'tests/images/not_image.txt', @@ -53,12 +66,13 @@ def test_hash_files_parallel(): 'tests/images/deeply/nested/image/smaller.jpg'] results = duplicate_finder.hash_files_parallel(files) results = list(results) - assert len(results) == 4 + assert len(results) == 5 - file, hash_, file_size, image_size, capture_time = results[0] + file, (hashes, _) = results[0] assert file == 'tests/images/u.jpg' - assert hash_ == '87d35b107818e5d7963a5d2869d4b4b6c3950a873c7ee11ed2790eba2da2b03d' - + assert hashes == [b'bin:b\xcfq\xb4*\x15\x9eJ\xf4n\xd9M\x83zG\x05', + b'img:\xc3\x95\n\x87<~\xe1\x1e', b'img:\xd2y\x0e\xba-\xa2\xb0=', + b'img:\x96:](i\xd4\xb4\xb6', b'img:\x87\xd3[\x10x\x18\xe5\xd7'] 
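# The single b'bin:' entry in the expected list above is the blake2b hash of
# the raw file contents produced by BinaryHasher (see hashers/binaryhasher.py);
# the four b'img:' entries are the perceptual hashes produced by ImageHasher,
# which hashes the image once per 90-degree rotation (see hashers/imagehasher.py).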
duplicate_finder.NUM_PROCESSES = 1 results_1_process = duplicate_finder.hash_files_parallel(files) @@ -67,23 +81,23 @@ def test_hash_files_parallel(): def test_add_to_database(): + """test_add_to_database""" db = mongomock.MongoClient().image_database.images result = duplicate_finder.hash_file('tests/images/u.jpg') duplicate_finder._add_to_database(*result, db=db) - db_result = db.find_one({'_id' : result[0]}) + db_result = db.find_one({'_id': result[0]}) assert result[0] == db_result['_id'] - assert result[1] == db_result['hash'] - assert result[2] == db_result['file_size'] - assert result[3] == db_result['image_size'] - assert result[4] == db_result['capture_time'] + assert result[1][0] == db_result['hashes'] + assert result[1][1] == db_result['meta'] # Duplicate entry should print out an error duplicate_finder._add_to_database(*result, db=db) def test_in_database(): + """test_in_database""" db = mongomock.MongoClient().image_database.images result = duplicate_finder.hash_file('tests/images/u.jpg') duplicate_finder._add_to_database(*result, db=db) @@ -91,12 +105,13 @@ def test_in_database(): assert duplicate_finder._in_database('tests/images/u.jpg', db) -def test_new_image_files(): +def test_new_files(): + """test_new_files""" db = mongomock.MongoClient().image_database.images result = duplicate_finder.hash_file('tests/images/u.jpg') duplicate_finder._add_to_database(*result, db=db) - results = duplicate_finder.new_image_files(['tests/images/u.jpg', 'another_file'], db) + results = duplicate_finder.new_files(['tests/images/u.jpg', 'another_file'], db) results = list(results) assert len(results) == 1 @@ -104,42 +119,49 @@ def test_new_image_files(): def test_add(): - file_name = '{}/tests/images/u.jpg'.format(os.getcwd()) + """test_add""" + file_name = f'{os.getcwd()}/tests/images/u.jpg' db = mongomock.MongoClient().image_database.images duplicate_finder.add(['tests'], db) - db_result = db.find_one({'_id' : file_name}) + db_result = db.find_one({'_id': file_name}) assert db_result['_id'] == file_name - assert db_result['hash'] == '87d35b107818e5d7963a5d2869d4b4b6c3950a873c7ee11ed2790eba2da2b03d' - assert db.count() > 0 + assert db_result['hashes'] == [b'bin:b\xcfq\xb4*\x15\x9eJ\xf4n\xd9M\x83zG\x05', + b'img:\xc3\x95\n\x87<~\xe1\x1e', b'img:\xd2y\x0e\xba-\xa2\xb0=', + b'img:\x96:](i\xd4\xb4\xb6', b'img:\x87\xd3[\x10x\x18\xe5\xd7'] + assert db.count_documents({}) > 0 + def test_remove(): + """test_remove""" db = mongomock.MongoClient().image_database.images duplicate_finder.add(['tests'], db) - assert db.count() > 0 + assert db.count_documents({}) > 0 duplicate_finder.remove(['test'], db) - assert db.count() > 0 + assert db.count_documents({}) > 0 duplicate_finder.remove(['tests'], db) - assert db.count() == 0 + assert db.count_documents({}) == 0 duplicate_finder.remove(['tests'], db) - assert db.count() == 0 + assert db.count_documents({}) == 0 def test_clear(): + """test_clear""" db = mongomock.MongoClient().image_database.images duplicate_finder.add(['tests'], db) - assert db.count() > 0 + assert db.count_documents({}) > 0 duplicate_finder.clear(db) - assert db.count() == 0 + assert db.count_documents({}) == 0 def test_find(): + """test_find""" db = mongomock.MongoClient().image_database.images duplicate_finder.add(['tests/images/deeply/nested'], db) @@ -152,26 +174,53 @@ def test_find(): time_dups = duplicate_finder.find(db, match_time=True) assert dups == time_dups + +def test_find_videos(): + """test_find_videos""" + db = mongomock.MongoClient().image_database.images + 
duplicate_finder.clear(db) + duplicate_finder.add(['tests/videos/'], db) + + dups = duplicate_finder.find(db, match_time=False) + + assert len(dups) == 8 + assert {dups[0]['total'], dups[1]['total'], dups[2]['total'], dups[3]['total'], + dups[4]['total'], dups[5]['total'], dups[6]['total'], dups[7]['total']} == {2, 3, 4} + + found = False + for duplicate in dups: + if b'img:\x83+\x03s=\xf2vT' in duplicate['_id']: + found = True + assert duplicate['total'] == 4 + filenames = {item['file_name'].rsplit('tests/videos/', 1)[1] + for item in duplicate['items']} + assert filenames == {'2.mp4', '3.mp4', '4.mkv', '6.avi'} + + assert found is True + + def test_find_fuzzy(): + """test_find_fuzzy""" db = mongomock.MongoClient().image_database.images duplicate_finder.add(['tests/images/'], db) dups = duplicate_finder.find_threshold(db, 0) - assert len(dups) == 2 - assert dups[0]['total'] == 2 - assert dups[1]['total'] == 5 + assert len(dups) == 3 + assert {dups[0]['total'], dups[1]['total'], dups[2]['total']} == {2, 5, 8} dups = duplicate_finder.find_threshold(db, 10) - assert len(dups) == 1 - assert dups[0]['total'] == 8 + assert len(dups) == 2 + assert {dups[0]['total'], dups[1]['total']} == {2, 8} + def test_dedup(): + """test_dedup""" db = mongomock.MongoClient().image_database.images - duplicate_finder.add(['tests'], db) - assert db.count() == 9 + duplicate_finder.add(['tests/images'], db) + assert db.count_documents({}) == 11 dups = duplicate_finder.find(db, match_time=False) - assert len(dups) == 2 + assert len(dups) == 6 duplicate_finder.delete_duplicates(dups, db) @@ -185,11 +234,12 @@ def test_dedup(): assert not os.path.exists(item['file_name']) assert os.path.exists(os.path.join('Trash', os.path.basename(item['file_name']))) - assert db.count() == 4 + assert db.count_documents({}) == 3 # Move files back for dup in dups: for item in dup['items'][1:]: - shutil.move(os.path.join('Trash', os.path.basename(item['file_name'])), + shutil.copy(os.path.join('Trash', os.path.basename(item['file_name'])), item['file_name']) - os.rmdir('Trash') + + shutil.rmtree('Trash') diff --git a/tests/videos/1.mp4 b/tests/videos/1.mp4 new file mode 100644 index 0000000..12c7e90 Binary files /dev/null and b/tests/videos/1.mp4 differ diff --git a/tests/videos/2.mp4 b/tests/videos/2.mp4 new file mode 100644 index 0000000..bc8922a Binary files /dev/null and b/tests/videos/2.mp4 differ diff --git a/tests/videos/3.mp4 b/tests/videos/3.mp4 new file mode 100644 index 0000000..ef1bf05 Binary files /dev/null and b/tests/videos/3.mp4 differ diff --git a/tests/videos/4.mkv b/tests/videos/4.mkv new file mode 100644 index 0000000..f8f2f1f Binary files /dev/null and b/tests/videos/4.mkv differ diff --git a/tests/videos/6.avi b/tests/videos/6.avi new file mode 100644 index 0000000..0daedbd Binary files /dev/null and b/tests/videos/6.avi differ
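For reference, the fuzzy matching exercised by `test_find_fuzzy` above is built on the `pybktree` package pinned in `requirements.txt`. A minimal standalone sketch of that lookup (not part of the patch; the first value mirrors the `img:` hash of `tests/images/u.jpg` seen in the tests, the other two are made up) might look like this:

```python
# Standalone sketch of the BK-tree lookup that find_threshold builds on.
import pybktree

hashes = [
    0x87D35B107818E5D7,  # img: hash of tests/images/u.jpg, as an integer
    0x87D35B107818E5D6,  # made up: differs from the first by one bit
    0x0F0F0F0F0F0F0F0F,  # made up: unrelated hash
]

# A BK-tree indexed by Hamming distance between integer hashes.
tree = pybktree.BKTree(pybktree.hamming_distance, hashes)

# Everything within 1 bit of the query, returned as (distance, item)
# tuples sorted by distance -- here the first two hashes, at 0 and 1 bits.
print(tree.find(0x87D35B107818E5D7, 1))
```

Looking items up in a BK-tree avoids comparing every hash against every other hash, which is presumably why `find_threshold` builds one rather than doing a pairwise scan.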