
add image_deduplicator (#72)
* add image_deduplicator

* update science_requires.txt

* modify image_dedup
chenhesen authored Nov 15, 2023
1 parent 985e475 commit 50f1aca
Showing 14 changed files with 357 additions and 7 deletions.
2 changes: 2 additions & 0 deletions configs/config_all.yaml
@@ -198,6 +198,8 @@ process:
      hamming_distance: 4 # the max hamming distance to regard 2 samples as similar enough pair. Should be less than num_blocks always
      lowercase: true # whether to convert text to lower case
      ignore_pattern: null # whether to ignore sub-strings with specific pattern when computing simhash.
  - image_deduplicator: # deduplicator to deduplicate samples at document-level using exact matching of images between documents.
      method: phash # hash method for image. One of [phash, dhash, whash, ahash]

  # Selector ops
  - topk_specified_field_selector: # selector to select top samples based on the sorted specified field
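For reference, a minimal config that enables only the new op could look like the sketch below. This is a hypothetical excerpt, not part of the commit: it assumes the same top-level process: list structure as configs/config_all.yaml, and the dataset_path / export_path values are placeholders.

dataset_path: ./demo-dataset.jsonl        # placeholder: input data with an 'images' field
export_path: ./demo-dataset-dedup.jsonl   # placeholder: where the deduplicated data is written

process:
  - image_deduplicator:
      method: phash                        # one of [phash, dhash, whash, ahash]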
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/__init__.py
@@ -1,2 +1,2 @@
 from . import (document_deduplicator, document_minhash_deduplicator,
-               document_simhash_deduplicator)
+               document_simhash_deduplicator, image_deduplicator)
123 changes: 123 additions & 0 deletions data_juicer/ops/deduplicator/image_deduplicator.py
@@ -0,0 +1,123 @@
from collections import defaultdict
from typing import Dict, Set

import numpy as np
from imagededup.methods import AHash, DHash, PHash, WHash

from data_juicer.utils.constant import Fields, HashKeys
from data_juicer.utils.mm_utils import load_image

from ..base_op import OPERATORS, Deduplicator
from ..op_fusion import LOADED_IMAGES

HASH_METHOD = {
    'phash': PHash(),
    'dhash': DHash(),
    'whash': WHash(),
    'ahash': AHash()
}


@OPERATORS.register_module('image_deduplicator')
@LOADED_IMAGES.register_module('image_deduplicator')
class ImageDeduplicator(Deduplicator):
    """
    Deduplicator to deduplicate samples at document-level using exact matching
    of images between documents.
    """

    def __init__(self, method: str = 'phash', *args, **kwargs):
        """
        Initialization method.
        :param method: hash method for image, one of [phash, dhash, whash, ahash]
        :param args: extra args
        :param kwargs: extra args
        """
        super().__init__(*args, **kwargs)
        if method not in HASH_METHOD.keys():
            raise ValueError(f'Hash method [{method}] is not supported. '
                             f'Can only be one of {HASH_METHOD.keys()}.')
        self.hasher = HASH_METHOD[method]

    def compute_hash(self, sample, context=False):
        # check if it's computed already
        if HashKeys.imagehash in sample:
            return sample

        # there is no image in this sample
        sample[HashKeys.imagehash] = ''
        if self.image_key not in sample or not sample[self.image_key]:
            return sample

        # load images
        loaded_image_keys = sample[self.image_key]
        images = {}
        for loaded_image_key in loaded_image_keys:
            if context and loaded_image_key in sample[Fields.context]:
                # load from context
                images[loaded_image_key] = sample[
                    Fields.context][loaded_image_key]
            else:
                if loaded_image_key not in images:
                    # avoid loading the same image more than once
                    image = load_image(loaded_image_key)
                    images[loaded_image_key] = image
                    if context:
                        # store the image data into context
                        sample[Fields.context][loaded_image_key] = image

        # compute hash
        for key in images:
            sample[HashKeys.imagehash] += self.hasher.encode_image(
                image_array=np.array(images[key]))
        return sample

    def process(self, dataset, show_num=0):
        """
        For doc-level, dataset --> dataset.
        :param dataset: input dataset
        :param show_num: number of traced samples used when the tracer is
            enabled.
        :return: deduplicated dataset and the sampled duplicate pairs.
        """
        # no need to deduplicate because there are too few samples
        if len(dataset) <= 1:
            return dataset, {}

        dup_hashes = None
        if show_num > 0:
            # sample duplicate pairs
            hash2ids: Dict[int, Set[int]] = defaultdict(set)
            for sid, hash_val in enumerate(dataset[HashKeys.imagehash]):
                if hash_val:
                    hash2ids[hash_val].add(sid)
            dup_samples = sorted(list(hash2ids.items()),
                                 key=lambda x: len(x[1]),
                                 reverse=True)
            dup_hashes = set([
                item[0] for item in dup_samples if len(item[1]) > 1
            ][:show_num])

        def _filter_dup_helper(sample, hashes):
            hash = sample[HashKeys.imagehash]
            if not hash:
                return True
            if show_num > 0 and hash in dup_hashes \
                    and len(dup_pairs[hash]) < 2:
                # tracer is enabled and there are not enough duplicate pairs yet
                dup_pairs[hash].append(sample)
            if hash in hashes:
                return False
            else:
                hashes.add(hash)
                return True

        hashes = set()
        dup_pairs = {hash_v: [] for hash_v in dup_hashes} if dup_hashes else {}
        dataset = dataset.filter(
            _filter_dup_helper,
            fn_kwargs=dict(hashes=hashes),
            load_from_cache_file=False if show_num > 0 else True)  # num_proc=1
        return dataset, dup_pairs
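To illustrate how the new deduplicator is exercised end to end (a minimal sketch, not part of this commit), the snippet below hashes three documents and drops the one whose image repeats. It assumes the base Deduplicator uses 'images' as the default image_key, that samples live in a HuggingFace datasets.Dataset as with the other Data-Juicer deduplicators, and it reuses the test images added in this commit as stand-in paths.

from datasets import Dataset

from data_juicer.ops.deduplicator.image_deduplicator import ImageDeduplicator

# three documents; the first two point to the same image file
ds = Dataset.from_list([
    {'text': 'doc A', 'images': ['tests/ops/data/img4.png']},
    {'text': 'doc B', 'images': ['tests/ops/data/img4.png']},
    {'text': 'doc C', 'images': ['tests/ops/data/img5.jpg']},
])

op = ImageDeduplicator(method='phash')

# first attach the per-sample image hash, then filter exact duplicates
ds = ds.map(op.compute_hash)
deduped, dup_pairs = op.process(ds, show_num=1)

print(len(deduped))  # expected: 2 (doc B is dropped as a duplicate of doc A)
print(dup_pairs)     # up to show_num traced duplicate pairs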
3 changes: 3 additions & 0 deletions data_juicer/utils/constant.py
@@ -39,6 +39,9 @@ class HashKeys(object):
    minhash = DEFAULT_PREFIX + 'minhash'
    simhash = DEFAULT_PREFIX + 'simhash'

    # image
    imagehash = DEFAULT_PREFIX + 'imagehash'


class InterVars(object):
    # text
3 changes: 2 additions & 1 deletion demos/overview_scan/app.py
@@ -90,7 +90,7 @@
| Formatter | 7 | Discovers, loads, and canonicalizes source data |
| Mapper | 21 | Edits and transforms samples |
| Filter | 19 | Filters out low-quality samples |
-| Deduplicator | 3 | Detects and removes duplicate samples |
+| Deduplicator | 4 | Detects and removes duplicate samples |
| Selector | 2 | Selects top samples based on ranking |
'''

@@ -164,6 +164,7 @@
| document_deduplicator | General | en, zh | Deduplicate samples at document-level by comparing MD5 hash |
| document_minhash_deduplicator | General | en, zh | Deduplicate samples at document-level using MinHashLSH |
| document_simhash_deduplicator | General | en, zh | Deduplicate samples at document-level using SimHash |
| image_deduplicator | Image | - | Deduplicate samples at document-level using exact matching of images between documents |
''',
'selector':
'''
3 changes: 2 additions & 1 deletion docs/Operators.md
@@ -12,7 +12,7 @@ The operators in Data-Juicer are categorized into 5 types.
| [ Formatter ]( #formatter ) | 7 | Discovers, loads, and canonicalizes source data |
| [ Mapper ]( #mapper ) | 21 | Edits and transforms samples |
| [ Filter ]( #filter ) | 19 | Filters out low-quality samples |
-| [ Deduplicator ]( #deduplicator ) | 3 | Detects and removes duplicate samples |
+| [ Deduplicator ]( #deduplicator ) | 4 | Detects and removes duplicate samples |
| [ Selector ]( #selector ) | 2 | Selects top samples based on ranking |


@@ -102,6 +102,7 @@ All the specific operators are listed below, each featured with several capabilities.
| document_deduplicator | General | en, zh | Deduplicate samples at document-level by comparing MD5 hash |
| document_minhash_deduplicator | General | en, zh | Deduplicate samples at document-level using MinHashLSH |
| document_simhash_deduplicator | General | en, zh | Deduplicate samples at document-level using SimHash |
| image_deduplicator | Image | - | Deduplicate samples at document-level using exact matching of images between documents |


## Selector <a name="selector"/>
3 changes: 2 additions & 1 deletion docs/Operators_ZH.md
@@ -11,7 +11,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
| [ Formatter ]( #formatter ) | 7 | 发现、加载、规范化原始数据 |
| [ Mapper ]( #mapper ) | 21 | 对数据样本进行编辑和转换 |
| [ Filter ]( #filter ) | 19 | 过滤低质量样本 |
-| [ Deduplicator ]( #deduplicator ) | 3 | 识别、删除重复样本 |
+| [ Deduplicator ]( #deduplicator ) | 4 | 识别、删除重复样本 |
| [ Selector ]( #selector ) | 2 | 基于排序选取高质量样本 |

下面列出所有具体算子,每种算子都通过多个标签来注明其主要功能。
@@ -98,6 +98,7 @@
| document_deduplicator | General | en, zh | 通过比较 MD5 哈希值在文档级别对样本去重 |
| document_minhash_deduplicator | General | en, zh | 使用 MinHashLSH 在文档级别对样本去重 |
| document_simhash_deduplicator | General | en, zh | 使用 SimHash 在文档级别对样本去重 |
| image_deduplicator | Image | - | 使用文档之间图像的精确匹配在文档级别删除重复样本 |

## Selector <a name="selector"/>

1 change: 1 addition & 0 deletions environments/science_requires.txt
@@ -14,4 +14,5 @@ nlpcda
nltk
transformers
opencc==1.1.6
imagededup
torch
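Since imagededup is added to the optional scientific requirements, it presumably has to be installed before the new op can run; a straightforward way (an assumption, not spelled out in the commit) is:

pip install -r environments/science_requires.txt    # or just: pip install imagededup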
Binary file added tests/ops/data/img4.png
Binary file added tests/ops/data/img5.jpg
Binary file added tests/ops/data/img6.jpg
Binary file added tests/ops/data/img7.jpg
