
add image_deduplicator (#72)
* add image_deduplicator

* update science_requires.txt

* modify image_dedup
chenhesen authored Nov 15, 2023
1 parent 985e475 commit 50f1aca
Showing 14 changed files with 357 additions and 7 deletions.
2 changes: 2 additions & 0 deletions configs/config_all.yaml
@@ -198,6 +198,8 @@ process:
      hamming_distance: 4 # the max hamming distance to regard 2 samples as similar enough pair. Should be less than num_blocks always
      lowercase: true # whether to convert text to lower case
      ignore_pattern: null # whether to ignore sub-strings with specific pattern when computing simhash.
  - image_deduplicator: # deduplicator to deduplicate samples at document-level using exact matching of images between documents.
      method: phash # hash method for image. One of [phash, dhash, whash, ahash]

  # Selector ops
  - topk_specified_field_selector: # selector to select top samples based on the sorted specified field
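For reference, a minimal config that enables only the new op could look like the sketch below. This is a hypothetical excerpt, not part of the commit: it assumes the same top-level process: list structure as configs/config_all.yaml, and the dataset_path / export_path values are placeholders.

dataset_path: ./demo-dataset.jsonl        # placeholder: input data with an 'images' field
export_path: ./demo-dataset-dedup.jsonl   # placeholder: where the deduplicated data is written

process:
  - image_deduplicator:
      method: phash                        # one of [phash, dhash, whash, ahash]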
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/__init__.py
@@ -1,2 +1,2 @@
 from . import (document_deduplicator, document_minhash_deduplicator,
-               document_simhash_deduplicator)
+               document_simhash_deduplicator, image_deduplicator)
123 changes: 123 additions & 0 deletions data_juicer/ops/deduplicator/image_deduplicator.py
@@ -0,0 +1,123 @@
from collections import defaultdict
from typing import Dict, Set

import numpy as np
from imagededup.methods import AHash, DHash, PHash, WHash

from data_juicer.utils.constant import Fields, HashKeys
from data_juicer.utils.mm_utils import load_image

from ..base_op import OPERATORS, Deduplicator
from ..op_fusion import LOADED_IMAGES

HASH_METHOD = {
    'phash': PHash(),
    'dhash': DHash(),
    'whash': WHash(),
    'ahash': AHash()
}


@OPERATORS.register_module('image_deduplicator')
@LOADED_IMAGES.register_module('image_deduplicator')
class ImageDeduplicator(Deduplicator):
    """
    Deduplicator to deduplicate samples at document-level using exact matching
    of images between documents.
    """

    def __init__(self, method: str = 'phash', *args, **kwargs):
        """
        Initialization method.
        :param method: hash method for image, one of [phash, dhash, whash, ahash]
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        if method not in HASH_METHOD.keys():
            raise ValueError(f'Hash method [{method}] is not supported. '
                             f'Can only be one of {HASH_METHOD.keys()}.')
        self.hasher = HASH_METHOD[method]

    def compute_hash(self, sample, context=False):
        # check if it's computed already
        if HashKeys.imagehash in sample:
            return sample

        # there is no image in this sample
        sample[HashKeys.imagehash] = ''
        if self.image_key not in sample or not sample[self.image_key]:
            return sample

        # load images
        loaded_image_keys = sample[self.image_key]
        images = {}
        for loaded_image_key in loaded_image_keys:
            if context and loaded_image_key in sample[Fields.context]:
                # load from context
                images[loaded_image_key] = sample[
                    Fields.context][loaded_image_key]
            else:
                if loaded_image_key not in images:
                    # avoid loading the same image more than once
                    image = load_image(loaded_image_key)
                    images[loaded_image_key] = image
                    if context:
                        # store the image data into context
                        sample[Fields.context][loaded_image_key] = image

        # compute hash
        for key in images:
            sample[HashKeys.imagehash] += self.hasher.encode_image(
                image_array=np.array(images[key]))
        return sample

    def process(self, dataset, show_num=0):
        """
        For doc-level, dataset --> dataset.
        :param dataset: input dataset
        :param show_num: number of traced samples used when the tracer is
            enabled.
        :return: deduplicated dataset and the sampled duplicate pairs.
        """
        # no need to deduplicate because there are too few samples
        if len(dataset) <= 1:
            return dataset, {}

        dup_hashes = None
        if show_num > 0:
            # sample duplicate pairs
            hash2ids: Dict[int, Set[int]] = defaultdict(set)
            for sid, hash_val in enumerate(dataset[HashKeys.imagehash]):
                if hash_val:
                    hash2ids[hash_val].add(sid)
            dup_samples = sorted(list(hash2ids.items()),
                                 key=lambda x: len(x[1]),
                                 reverse=True)
            dup_hashes = set([
                item[0] for item in dup_samples if len(item[1]) > 1
            ][:show_num])

        def _filter_dup_helper(sample, hashes):
            hash = sample[HashKeys.imagehash]
            if not hash:
                return True
            if show_num > 0 and hash in dup_hashes \
                    and len(dup_pairs[hash]) < 2:
                # tracer is enabled and there are not enough duplicate pairs yet
                dup_pairs[hash].append(sample)
            if hash in hashes:
                return False
            else:
                hashes.add(hash)
                return True

        hashes = set()
        dup_pairs = {hash_v: [] for hash_v in dup_hashes} if dup_hashes else {}
        dataset = dataset.filter(
            _filter_dup_helper,
            fn_kwargs=dict(hashes=hashes),
            load_from_cache_file=False if show_num > 0 else True)  # num_proc=1
        return dataset, dup_pairs
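To illustrate how the new deduplicator is exercised end to end (a minimal sketch, not part of this commit), the snippet below hashes three documents and drops the one whose image repeats. It assumes the base Deduplicator uses 'images' as the default image_key, that samples live in a HuggingFace datasets.Dataset as with the other Data-Juicer deduplicators, and it reuses the test images added in this commit as stand-in paths.

from datasets import Dataset

from data_juicer.ops.deduplicator.image_deduplicator import ImageDeduplicator

# three documents; the first two point to the same image file
ds = Dataset.from_list([
    {'text': 'doc A', 'images': ['tests/ops/data/img4.png']},
    {'text': 'doc B', 'images': ['tests/ops/data/img4.png']},
    {'text': 'doc C', 'images': ['tests/ops/data/img5.jpg']},
])

op = ImageDeduplicator(method='phash')

# first attach the per-sample image hash, then filter exact duplicates
ds = ds.map(op.compute_hash)
deduped, dup_pairs = op.process(ds, show_num=1)

print(len(deduped))  # expected: 2 (doc B is dropped as a duplicate of doc A)
print(dup_pairs)     # up to show_num traced duplicate pairs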
3 changes: 3 additions & 0 deletions data_juicer/utils/constant.py
@@ -39,6 +39,9 @@ class HashKeys(object):
    minhash = DEFAULT_PREFIX + 'minhash'
    simhash = DEFAULT_PREFIX + 'simhash'

    # image
    imagehash = DEFAULT_PREFIX + 'imagehash'


class InterVars(object):
    # text
3 changes: 2 additions & 1 deletion demos/overview_scan/app.py
@@ -90,7 +90,7 @@
| Formatter | 7 | Discovers, loads, and canonicalizes source data |
| Mapper | 21 | Edits and transforms samples |
| Filter | 19 | Filters out low-quality samples |
-| Deduplicator | 3 | Detects and removes duplicate samples |
+| Deduplicator | 4 | Detects and removes duplicate samples |
| Selector | 2 | Selects top samples based on ranking |
'''

@@ -164,6 +164,7 @@
| document_deduplicator | General | en, zh | Deduplicate samples at document-level by comparing MD5 hash |
| document_minhash_deduplicator | General | en, zh | Deduplicate samples at document-level using MinHashLSH |
| document_simhash_deduplicator | General | en, zh | Deduplicate samples at document-level using SimHash |
| image_deduplicator | Image | - | Deduplicate samples at document-level using exact matching of images between documents |
''',
'selector':
'''
3 changes: 2 additions & 1 deletion docs/Operators.md
@@ -12,7 +12,7 @@ The operators in Data-Juicer are categorized into 5 types.
| [ Formatter ]( #formatter ) | 7 | Discovers, loads, and canonicalizes source data |
| [ Mapper ]( #mapper ) | 21 | Edits and transforms samples |
| [ Filter ]( #filter ) | 19 | Filters out low-quality samples |
-| [ Deduplicator ]( #deduplicator ) | 3 | Detects and removes duplicate samples |
+| [ Deduplicator ]( #deduplicator ) | 4 | Detects and removes duplicate samples |
| [ Selector ]( #selector ) | 2 | Selects top samples based on ranking |


@@ -102,6 +102,7 @@ All the specific operators are listed below, each featured with several capabilities.
| document_deduplicator | General | en, zh | Deduplicate samples at document-level by comparing MD5 hash |
| document_minhash_deduplicator | General | en, zh | Deduplicate samples at document-level using MinHashLSH |
| document_simhash_deduplicator | General | en, zh | Deduplicate samples at document-level using SimHash |
| image_deduplicator | Image | - | Deduplicate samples at document-level using exact matching of images between documents |


## Selector <a name="selector"/>
3 changes: 2 additions & 1 deletion docs/Operators_ZH.md
@@ -11,7 +11,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
| [ Formatter ]( #formatter ) | 7 | 发现、加载、规范化原始数据 |
| [ Mapper ]( #mapper ) | 21 | 对数据样本进行编辑和转换 |
| [ Filter ]( #filter ) | 19 | 过滤低质量样本 |
-| [ Deduplicator ]( #deduplicator ) | 3 | 识别、删除重复样本 |
+| [ Deduplicator ]( #deduplicator ) | 4 | 识别、删除重复样本 |
| [ Selector ]( #selector ) | 2 | 基于排序选取高质量样本 |

下面列出所有具体算子,每种算子都通过多个标签来注明其主要功能。
@@ -98,6 +98,7 @@
| document_deduplicator | General | en, zh | 通过比较 MD5 哈希值在文档级别对样本去重 |
| document_minhash_deduplicator | General | en, zh | 使用 MinHashLSH 在文档级别对样本去重 |
| document_simhash_deduplicator | General | en, zh | 使用 SimHash 在文档级别对样本去重 |
| image_deduplicator | Image | - | 使用文档之间图像的精确匹配在文档级别删除重复样本 |

## Selector <a name="selector"/>

1 change: 1 addition & 0 deletions environments/science_requires.txt
@@ -14,4 +14,5 @@ nlpcda
nltk
transformers
opencc==1.1.6
imagededup
torch
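Since imagededup is added to the optional scientific requirements, it presumably has to be installed before the new op can run; a straightforward way (an assumption, not spelled out in the commit) is:

pip install -r environments/science_requires.txt    # or just: pip install imagededup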
Binary file added tests/ops/data/img4.png
Binary file added tests/ops/data/img5.jpg
Binary file added tests/ops/data/img6.jpg
Binary file added tests/ops/data/img7.jpg
