Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added image_size_filter #73

Merged
merged 6 commits into from
Nov 15, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,10 @@ process:
min_ratio: 0.333 # the min aspect ratio of filter range
max_ratio: 3.0 # the max aspect ratio of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_size_filter: # filter samples according to the size of images (in bytes) within them
min_size: "0" # the min size of filter range
max_size: "1TB" # the max size of filter range
yxdyc marked this conversation as resolved.
Show resolved Hide resolved
any_or_all: any # keep this sample when any/all images meet the filter condition
- language_id_score_filter: # filter text in specific language with language scores larger than a specific max value
lang: en # keep text in what language
min_score: 0.8 # the min language scores to filter text
Expand Down
12 changes: 6 additions & 6 deletions data_juicer/ops/filter/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from . import (alphanumeric_filter, average_line_length_filter,
character_repetition_filter, flagged_words_filter,
image_aspect_ratio_filter, language_id_score_filter,
maximum_line_length_filter, perplexity_filter,
special_characters_filter, specified_field_filter,
specified_numeric_field_filter, stopwords_filter, suffix_filter,
text_length_filter, token_num_filter, word_num_filter,
word_repetition_filter)
image_aspect_ratio_filter, image_size_filter,
language_id_score_filter, maximum_line_length_filter,
perplexity_filter, special_characters_filter,
specified_field_filter, specified_numeric_field_filter,
stopwords_filter, suffix_filter, text_length_filter,
token_num_filter, word_num_filter, word_repetition_filter)
76 changes: 76 additions & 0 deletions data_juicer/ops/filter/image_size_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import numpy as np

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import get_image_size, size_to_bytes

from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES


@OPERATORS.register_module('image_size_filter')
@LOADED_IMAGES.register_module('image_size_filter')
yxdyc marked this conversation as resolved.
Show resolved Hide resolved
class ImageSizeFilter(Filter):
"""Keep data samples whose image size (in bytes/kb/MB/...) within a
specific range.
"""

def __init__(self,
             min_size: str = '0',
             max_size: str = '1TB',
             any_or_all: str = 'any',
             *args,
             **kwargs):
    """
    Set up the size bounds and the keep strategy of the filter.

    :param min_size: Lower bound of the accepted image size, given as
        a size string. Defaults to '0'.
    :param max_size: Upper bound of the accepted image size, given as
        a size string. Defaults to '1TB', which stands in for the
        "no upper limit" case.
    :param any_or_all: Keep strategy over the images of one sample.
        'any': keep the sample if at least one image meets the
        condition. 'all': keep the sample only if every image meets
        the condition.
    :param args: extra positional args forwarded to the parent class
    :param kwargs: extra keyword args forwarded to the parent class
    :raises ValueError: if `any_or_all` is neither 'any' nor 'all'.
    """
    super().__init__(*args, **kwargs)

    # both bounds are converted once here and stored in
    # `self.min_size` / `self.max_size`
    lower_bound = size_to_bytes(min_size)
    upper_bound = size_to_bytes(max_size)
    self.min_size = lower_bound
    self.max_size = upper_bound

    supported_strategies = ['any', 'all']
    if any_or_all not in supported_strategies:
        raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
                         f'Can only be one of ["any", "all"].')

    # True -> 'any' strategy, False -> 'all' strategy
    self.any = (any_or_all == 'any')

def compute_stats(self, sample, context=False):
    """
    Record the size of every image of the sample in its stats field.

    :param sample: the sample to compute stats for; its stats are read
        from and written to `sample[Fields.stats]`.
    :param context: not used by this method.
    :return: the same sample, with `StatsKeys.image_sizes` filled in.
    """
    # nothing to do when the sizes were already computed
    if StatsKeys.image_sizes in sample[Fields.stats]:
        return sample

    has_images = (self.image_key in sample
                  and bool(sample[self.image_key]))
    if has_images:
        # sizes are obtained from the image paths via get_image_size,
        # so the images do not have to be loaded into memory
        sizes = list(map(get_image_size, sample[self.image_key]))
    else:
        # no image in this sample: store an empty array
        sizes = np.array([], dtype=np.float64)

    sample[Fields.stats][StatsKeys.image_sizes] = sizes
    return sample

def process(self, sample):
    """
    Decide whether the sample is kept, based on its recorded image sizes.

    :param sample: a sample whose stats already contain
        `StatsKeys.image_sizes`.
    :return: True when the sample has no recorded image sizes;
        otherwise whether any (or all, depending on the configured
        strategy) of its sizes lie in [min_size, max_size].
    """
    sizes = sample[Fields.stats][StatsKeys.image_sizes]
    in_range = np.array(
        [self.min_size <= size <= self.max_size for size in sizes])

    # samples without images are always kept
    if in_range.size == 0:
        return True

    # 'any' vs. 'all' keep strategy
    return in_range.any() if self.any else in_range.all()
1 change: 1 addition & 0 deletions data_juicer/utils/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class StatsKeys(object):

# image
aspect_ratios = 'aspect_ratios'
image_sizes = 'image_sizes'


class HashKeys(object):
Expand Down
44 changes: 44 additions & 0 deletions data_juicer/utils/mm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,47 @@ def load_image(path):
img_feature = Image()
img = img_feature.decode_example(img_feature.encode_example(path))
return img


def get_image_size(path):
    """Return the size, in bytes, of the file located at `path`.

    Only the file status is queried; the file content is not read.

    :param path: path of the file to measure.
    :return: the file size in bytes.
    """
    import os
    return os.stat(path).st_size


def size_to_bytes(size):
    """Convert a human-readable size string into a number of bytes.

    The string consists of a number, optionally followed by a unit, e.g.
    ``"0"``, ``"120kb"``, ``"1.5MB"`` or ``"1TiB"``. Units are case
    insensitive and always binary: both ``KB`` and ``KiB`` mean 1024
    bytes, ``MB``/``MiB`` mean 1024**2 bytes, and so on up to
    ``YB``/``YiB``. Without a unit the number is taken as bytes.

    Characters that are neither letters, digits nor ``.`` (spaces,
    thousands separators, signs, ...) are ignored.

    :param size: the size string to convert.
    :return: the size in bytes as an ``int``; a fractional number of
        bytes is truncated.
    :raises ValueError: if `size` contains no digits, if its numeric part
        is malformed (e.g. ``"1.2.3MB"``), or if the unit is unknown.
    """
    # function-scope import, same as the other helpers of this module
    from fractions import Fraction

    # unit suffix -> power of two it stands for
    unit_shifts = {
        'kb': 10, 'kib': 10,
        'mb': 20, 'mib': 20,
        'gb': 30, 'gib': 30,
        'tb': 40, 'tib': 40,
        'pb': 50, 'pib': 50,
        'eb': 60, 'eib': 60,
        'zb': 70, 'zib': 70,
        'yb': 80, 'yib': 80,
    }

    suffix = ''.join(char for char in size if char.isalpha()).lower()
    # keep the decimal point: dropping it would turn "1.5MB" into 15 MB
    number_text = ''.join(char for char in size
                          if char.isdigit() or char == '.')

    if not any(char.isdigit() for char in number_text):
        raise ValueError(f'Your input `size` does not contain numbers: {size}')

    # Fraction parses both integers and decimals exactly, so large values
    # do not lose precision the way a round-trip through float would
    try:
        size_number = Fraction(number_text)
    except ValueError:
        raise ValueError(
            f'Your input `size` contains a malformed number: {size}'
        ) from None

    if not suffix:
        # by default, if users do not specify the units, the number will be
        # regarded as in bytes
        return int(size_number)

    if suffix not in unit_shifts:
        raise ValueError(f'You specified unidentifiable unit: {suffix}, '
                         f'expected in [KB, MB, GB, TB, PB, EB, ZB, YB, '
                         f'KiB, MiB, GiB, TiB, PiB, EiB, ZiB, YiB], '
                         f'(case insensitive, counted by *Bytes*).')

    return int(size_number * (1 << unit_shifts[suffix]))
3 changes: 2 additions & 1 deletion demos/overview_scan/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@
|-----------------------------------|:------:|-------------------------------------------------|
| Formatter | 7 | Discovers, loads, and canonicalizes source data |
| Mapper | 21 | Edits and transforms samples |
| Filter | 17 | Filters out low-quality samples |
| Filter | 18 | Filters out low-quality samples |
| Deduplicator | 3 | Detects and removes duplicate samples |
| Selector | 2 | Selects top samples based on ranking |
'''
Expand Down Expand Up @@ -142,6 +142,7 @@
| character_repetition_filter | General | en, zh | Keeps samples with char-level n-gram repetition ratio within the specified range |
| flagged_words_filter | General | en, zh | Keeps samples with flagged-word ratio below the specified threshold |
| image_aspect_ratio_filter | Image | - | Keeps samples contains images with aspect ratios within specific range |
| image_size_filter | Image | - | Keeps samples contains images whose size in bytes are within specific range |
| language_id_score_filter | General | en, zh | Keeps samples of the specified language, judged by a predicted confidence score |
| maximum_line_length_filter | Code | en, zh | Keeps samples with maximum line length within the specified range |
| perplexity_filter | General | en, zh | Keeps samples with perplexity score below the specified threshold |
Expand Down
3 changes: 2 additions & 1 deletion docs/Operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ The operators in Data-Juicer are categorized into 5 types.
|-----------------------------------|:------:|-------------------------------------------------|
| [ Formatter ]( #formatter ) | 7 | Discovers, loads, and canonicalizes source data |
| [ Mapper ]( #mapper ) | 21 | Edits and transforms samples |
| [ Filter ]( #filter ) | 17 | Filters out low-quality samples |
| [ Filter ]( #filter ) | 18 | Filters out low-quality samples |
| [ Deduplicator ]( #deduplicator ) | 3 | Detects and removes duplicate samples |
| [ Selector ]( #selector ) | 2 | Selects top samples based on ranking |

Expand Down Expand Up @@ -77,6 +77,7 @@ All the specific operators are listed below, each featured with several capabili
| character_repetition_filter | General | en, zh | Keeps samples with char-level n-gram repetition ratio within the specified range |
| flagged_words_filter | General | en, zh | Keeps samples with flagged-word ratio below the specified threshold |
| image_aspect_ratio_filter | Image | - | Keeps samples contains images with aspect ratios within specific range |
| image_size_filter | Image | - | Keeps samples contains images whose size in bytes are within specific range |
| language_id_score_filter | General | en, zh | Keeps samples of the specified language, judged by a predicted confidence score |
| maximum_line_length_filter | Code | en, zh | Keeps samples with maximum line length within the specified range |
| perplexity_filter | General | en, zh | Keeps samples with perplexity score below the specified threshold |
Expand Down
1 change: 1 addition & 0 deletions docs/Operators_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
| character_repetition_filter | General | en, zh | 保留 char-level n-gram 重复比率在指定范围内的样本 |
| flagged_words_filter | General | en, zh | 保留使标记字比率保持在指定阈值以下的样本 |
| image_aspect_ratio_filter | Image | - | 保留样本中包含的图片的宽高比在指定范围内的样本 |
| image_size_filter | Image | - | 保留样本中包含的图片的大小(bytes)在指定范围内的样本 |
| language_id_score_filter | General | en, zh | 保留特定语言的样本,通过预测的置信度得分来判断 |
| maximum_line_length_filter | Code | en, zh | 保留最大行长度在指定范围内的样本 |
| perplexity_filter | General | en, zh | 保留困惑度低于指定阈值的样本 |
Expand Down
118 changes: 118 additions & 0 deletions tests/ops/filter/test_image_size_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import os
import unittest

from datasets import Dataset

from data_juicer.ops.filter.image_size_filter import ImageSizeFilter
from data_juicer.utils.constant import Fields


class ImageSizeFilterTest(unittest.TestCase):
    """Tests of ImageSizeFilter on three fixture images.

    According to the size notes kept next to the fixtures below, img1 is
    about 171KB, img2 about 189KB and img3 about 114KB.
    """

    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             '..', 'data')
    img1_path = os.path.join(data_path, 'img1.png')
    img2_path = os.path.join(data_path, 'img2.jpg')
    img3_path = os.path.join(data_path, 'img3.jpg')

    def _single_image_dataset(self):
        """Build a dataset of three samples holding one image each."""
        return Dataset.from_list([
            {'images': [self.img1_path]},  # 171KB
            {'images': [self.img2_path]},  # 189KB
            {'images': [self.img3_path]},  # 114KB
        ])

    def _image_pair_dataset(self):
        """Build a dataset of three samples holding two images each."""
        return Dataset.from_list([
            {'images': [self.img1_path, self.img2_path]},
            {'images': [self.img2_path, self.img3_path]},
            {'images': [self.img1_path, self.img3_path]},
        ])

    def _run_image_size_filter(self,
                               dataset: Dataset, target_list,
                               op):
        """Apply `op` to `dataset` and compare the kept images to
        `target_list`."""
        # the op stores its stats in the stats column, so make sure it
        # exists before computing them
        if Fields.stats not in dataset.features:
            dataset = dataset.add_column(name=Fields.stats,
                                         column=[{}] * dataset.num_rows)
        with_stats = dataset.map(op.compute_stats)
        kept = with_stats.filter(op.process)
        kept = kept.select_columns(column_names=[op.image_key])
        self.assertEqual(kept.to_list(), target_list)

    def test_min_max(self):
        # only img1 lies between both bounds
        expected = [{'images': [self.img1_path]}]
        op = ImageSizeFilter(min_size="120kb", max_size="180KB")
        self._run_image_size_filter(self._single_image_dataset(), expected,
                                    op)

    def test_min(self):
        # img3 is below the lower bound
        expected = [
            {'images': [self.img1_path]},
            {'images': [self.img2_path]},
        ]
        op = ImageSizeFilter(min_size="120kib")
        self._run_image_size_filter(self._single_image_dataset(), expected,
                                    op)

    def test_max(self):
        # img2 is above the upper bound
        expected = [
            {'images': [self.img1_path]},
            {'images': [self.img3_path]},
        ]
        op = ImageSizeFilter(max_size="180KiB")
        self._run_image_size_filter(self._single_image_dataset(), expected,
                                    op)

    def test_any(self):
        # only the samples that contain img1 have an image in range
        expected = [
            {'images': [self.img1_path, self.img2_path]},
            {'images': [self.img1_path, self.img3_path]},
        ]
        op = ImageSizeFilter(min_size="120kb", max_size="180KB",
                             any_or_all='any')
        self._run_image_size_filter(self._image_pair_dataset(), expected, op)

    def test_all(self):
        # every sample contains at least one image out of range
        expected = []
        op = ImageSizeFilter(min_size="120kb", max_size="180KB",
                             any_or_all='all')
        self._run_image_size_filter(self._image_pair_dataset(), expected, op)


# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()