Commit: fix conflicts
chenhesen committed Nov 15, 2023
2 parents 5580fc9 + 985e475 commit 85e8f42
Showing 14 changed files with 657 additions and 5 deletions.
10 changes: 10 additions & 0 deletions configs/config_all.yaml
@@ -108,6 +108,12 @@ process:
rep_len: 10 # repetition length for char-level n-gram
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.5 # the max ratio of filter range
- clip_similarity_filter: # filter samples according to the similarity between text and images.
hf_clip: openai/clip-vit-base-patch32 # name of the Hugging Face CLIP model to use
min_score: 0.1 # the min similarity of filter range
max_score: 1.0 # the max similarity of filter range
reduce_mode: avg # reduce mode when one text corresponds to multiple images in a chunk, must be one of ['avg', 'max', 'min']
any_or_all: any # keep this sample when any/all images meet the filter condition
- flagged_words_filter: # filter text with the flagged-word ratio larger than a specific max value
lang: en # consider flagged words in what language
tokenization: false # whether to use model to tokenize documents
@@ -120,6 +126,10 @@
min_ratio: 0.333 # the min aspect ratio of filter range
max_ratio: 3.0 # the max aspect ratio of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_size_filter: # filter samples according to the size of images (in bytes) within them
min_size: "0" # the min size of filter range
max_size: "1TB" # the max size of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- language_id_score_filter: # filter text in specific language with language scores larger than a specific max value
lang: en # keep text in what language
min_score: 0.8 # the min language scores to filter text
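The two new YAML entries above map one-to-one onto the constructor arguments of the filter ops added in this commit. A minimal Python sketch of the equivalent configuration (import paths follow the new file locations; note that constructing the CLIP filter downloads the model on first use):

from data_juicer.ops.filter.clip_similarity_filter import ClipSimilarityFilter
from data_juicer.ops.filter.image_size_filter import ImageSizeFilter

# equivalent of the clip_similarity_filter entry above
clip_filter = ClipSimilarityFilter(hf_clip='openai/clip-vit-base-patch32',
                                   min_score=0.1,
                                   max_score=1.0,
                                   reduce_mode='avg',
                                   any_or_all='any')

# equivalent of the image_size_filter entry above
size_filter = ImageSizeFilter(min_size='0', max_size='1TB', any_or_all='any')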
5 changes: 3 additions & 2 deletions data_juicer/ops/filter/__init__.py
@@ -1,6 +1,7 @@
 from . import (alphanumeric_filter, average_line_length_filter,
-               character_repetition_filter, flagged_words_filter,
-               image_aspect_ratio_filter, language_id_score_filter,
+               character_repetition_filter, clip_similarity_filter,
+               flagged_words_filter, image_aspect_ratio_filter,
+               image_size_filter, language_id_score_filter,
                maximum_line_length_filter, perplexity_filter,
                special_characters_filter, specified_field_filter,
                specified_numeric_field_filter, stopwords_filter, suffix_filter,
158 changes: 158 additions & 0 deletions data_juicer/ops/filter/clip_similarity_filter.py
@@ -0,0 +1,158 @@
import numpy as np
import torch
from jsonargparse.typing import ClosedUnitInterval

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import SpecialTokens, load_image
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

# avoid hanging when calling clip in multiprocessing
torch.set_num_threads(1)


@OPERATORS.register_module('clip_similarity_filter')
@LOADED_IMAGES.register_module('clip_similarity_filter')
class ClipSimilarityFilter(Filter):
"""Filter to keep samples those similarity between image and text
within a specific range."""

def __init__(self,
hf_clip='openai/clip-vit-base-patch32',
min_score: ClosedUnitInterval = 0.1,
max_score: ClosedUnitInterval = 1.0,
any_or_all: str = 'any',
reduce_mode: str = 'avg',
*args,
**kwargs):
"""
Initialization method.
:param hf_clip: clip model name on huggingface to compute
the similarity between image and text.
:param min_score: The min similarity to keep samples.
:param max_score: The max similarity to keep samples.
:param any_or_all: keep this sample with 'any' or 'all' strategy of
all images. 'any': keep this sample if any images meet the
condition. 'all': keep this sample only if all images meet the
condition.
:param reduce_mode: reduce mode when one text corresponds to
multiple images in a chunk.
'avg': Take the average of multiple values
'max': Take the max of multiple values
'min': Take the min of multiple values
:param args: extra args
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
self.min_score = min_score
self.max_score = max_score
if reduce_mode not in ['avg', 'max', 'min']:
raise ValueError(f'Reduce mode [{reduce_mode}] is not supported. '
f'Can only be one of ["avg", "max", "min"].')
if any_or_all not in ['any', 'all']:
raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
f'Can only be one of ["any", "all"].')
self.any = (any_or_all == 'any')
self.model_key = prepare_model(model_type='hf_clip', model_key=hf_clip)
self.reduce_mode = reduce_mode

def compute_stats(self, sample, context=False):
# check if it's computed already
if StatsKeys.clip_image_text_similarity in sample[Fields.stats]:
return sample

# there is no image in this sample
if self.image_key not in sample or not sample[self.image_key]:
sample[Fields.stats][
StatsKeys.clip_image_text_similarity] = np.array(
[], dtype=np.float64)
return sample

# load images
loaded_image_keys = sample[self.image_key]
images = {}
for loaded_image_key in loaded_image_keys:
if context and loaded_image_key in sample[Fields.context]:
# load from context
images[loaded_image_key] = sample[
Fields.context][loaded_image_key]
else:
if loaded_image_key not in images:
                    # avoid loading the same image multiple times
image = load_image(loaded_image_key)
images[loaded_image_key] = image
if context:
# store the image data into context
sample[Fields.context][loaded_image_key] = image

text = sample[self.text_key]
special_token_dict = {
key: value
for key, value in SpecialTokens.__dict__.items()
if not key.startswith('__')
}
offset = 0

def remove_special_token(text):
for value in special_token_dict.values():
text = text.replace(value, '')
return text

similarity = []
model, processor = get_model(self.model_key)

for chunk in text.split(SpecialTokens.eoc):
count = chunk.count(SpecialTokens.image)

# no image or no text
if count == 0 or len(chunk) == 0:
continue
else:
text_chunk = remove_special_token(chunk)
image_chunk = [
images[image_key]
for image_key in loaded_image_keys[offset:offset + count]
]

inputs = processor(text=text_chunk,
images=image_chunk,
return_tensors='pt',
truncation=True,
max_length=model.config.text_config.
max_position_embeddings,
padding=True)

                outputs = model(**inputs)
                # logits_per_text is cosine similarity scaled by CLIP's
                # trained logit scale (about 100), so dividing by 100 maps
                # it back to roughly [0, 1]
                chunk_logits = outputs.logits_per_text.detach().cpu() / 100.0

if self.reduce_mode == 'avg':
chunk_similarity = chunk_logits.mean()
elif self.reduce_mode == 'max':
chunk_similarity = chunk_logits.max()
else:
chunk_similarity = chunk_logits.min()

similarity.append(float(chunk_similarity))
offset += count
sample[Fields.stats][StatsKeys.clip_image_text_similarity] = similarity

return sample

def process(self, sample):
similarity = sample[Fields.stats][StatsKeys.clip_image_text_similarity]
if len(similarity) <= 0:
return True

keep_bools = np.array([
self.min_score <= sim_value <= self.max_score
for sim_value in similarity
])

# different strategies
if self.any:
return keep_bools.any()
else:
return keep_bools.all()
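For reference, a standalone sketch of the per-chunk similarity computation above, using the same Hugging Face CLIP classes; the image path and caption here are hypothetical:

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

image = Image.open('example.jpg')  # hypothetical image path
inputs = processor(text='a photo of a cat', images=[image],
                   return_tensors='pt', truncation=True, padding=True)
with torch.no_grad():
    outputs = model(**inputs)
# logits_per_text holds cosine similarities scaled by the trained logit
# scale (about 100), so dividing by 100 yields a score roughly in [0, 1]
similarity = float(outputs.logits_per_text[0, 0] / 100.0)
print(similarity)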
74 changes: 74 additions & 0 deletions data_juicer/ops/filter/image_size_filter.py
@@ -0,0 +1,74 @@
import numpy as np

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import get_image_size, size_to_bytes

from ..base_op import OPERATORS, Filter


@OPERATORS.register_module('image_size_filter')
class ImageSizeFilter(Filter):
"""Keep data samples whose image size (in bytes/kb/MB/...) within a
specific range.
"""

def __init__(self,
min_size: str = '0',
max_size: str = '1TB',
any_or_all: str = 'any',
*args,
**kwargs):
"""
Initialization method.
        :param min_size: The min image size to keep samples. Set to "0" by
            default, i.e., no lower size constraint.
        :param max_size: The max image size to keep samples. Set to "1TB" by
            default as an approximation of the unlimited case.
:param any_or_all: keep this sample with 'any' or 'all' strategy of
all images. 'any': keep this sample if any images meet the
condition. 'all': keep this sample only if all images meet the
condition.
:param args: extra args
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
self.min_size = size_to_bytes(min_size)
self.max_size = size_to_bytes(max_size)
if any_or_all not in ['any', 'all']:
raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
f'Can only be one of ["any", "all"].')
self.any = (any_or_all == 'any')

def compute_stats(self, sample, context=False):
# check if it's computed already
if StatsKeys.image_sizes in sample[Fields.stats]:
return sample

# there is no image in this sample
if self.image_key not in sample or not sample[self.image_key]:
sample[Fields.stats][StatsKeys.image_sizes] = np.array(
[], dtype=np.float64)
return sample

# for size calculation, no need to load images into memory
sample[Fields.stats][StatsKeys.image_sizes] = [
get_image_size(img_path) for img_path in sample[self.image_key]
]

return sample

def process(self, sample):
image_sizes = sample[Fields.stats][StatsKeys.image_sizes]
keep_bools = np.array([
self.min_size <= image_size <= self.max_size
for image_size in image_sizes
])
if len(keep_bools) <= 0:
return True

# different strategies
if self.any:
return keep_bools.any()
else:
return keep_bools.all()
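A small illustration of the 'any'/'all' keep strategies shared by the image filters, using hypothetical sizes:

import numpy as np

image_sizes = [512_000, 3_000_000]     # bytes, hypothetical sample
min_size, max_size = 0, 1 << 20        # keep images up to 1 MiB

keep_bools = np.array([min_size <= s <= max_size for s in image_sizes])
print(keep_bools.any())  # True: at least one image is within range
print(keep_bools.all())  # False: the 3 MB image exceeds max_size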
4 changes: 4 additions & 0 deletions data_juicer/utils/constant.py
@@ -28,6 +28,10 @@ class StatsKeys(object):

# image
aspect_ratios = 'aspect_ratios'
image_sizes = 'image_sizes'

# multimodal
clip_image_text_similarity = 'clip_image_text_similarity'


class HashKeys(object):
44 changes: 44 additions & 0 deletions data_juicer/utils/mm_utils.py
@@ -21,3 +21,47 @@ def load_image(path):
img_feature = Image()
img = img_feature.decode_example(img_feature.encode_example(path))
return img


def get_image_size(path):
import os
return os.path.getsize(path)


def size_to_bytes(size):
    alphabets_list = [char for char in size if char.isalpha()]
    # keep the decimal point so that sizes such as "1.5GB" parse correctly
    numbers_list = [char for char in size if char.isdigit() or char == '.']

    if len(numbers_list) == 0:
        raise ValueError(f'Your input `size` does not contain numbers: {size}')

    size_number = float(''.join(numbers_list))

    if len(alphabets_list) == 0:
        # by default, if users do not specify the unit, the number is
        # regarded as being in bytes
        return int(size_number)

    suffix = ''.join(alphabets_list).lower()

    # decimal (KB) and binary (KiB) suffixes are both treated as powers
    # of 1024
    shift_for_suffix = {
        'kb': 10, 'kib': 10,
        'mb': 20, 'mib': 20,
        'gb': 30, 'gib': 30,
        'tb': 40, 'tib': 40,
        'pb': 50, 'pib': 50,
        'eb': 60, 'eib': 60,
        'zb': 70, 'zib': 70,
        'yb': 80, 'yib': 80,
    }
    if suffix not in shift_for_suffix:
        raise ValueError(f'You specified an unidentifiable unit: {suffix}, '
                         f'expected one of [KB, MB, GB, TB, PB, EB, ZB, YB, '
                         f'KiB, MiB, GiB, TiB, PiB, EiB, ZiB, YiB] '
                         f'(case-insensitive, counted in *bytes*).')
    return int(size_number * (1 << shift_for_suffix[suffix]))
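Expected behavior of size_to_bytes given the implementation above: decimal and binary suffixes are both interpreted as powers of 1024, and bare numbers are taken as bytes.

assert size_to_bytes('0') == 0
assert size_to_bytes('10KB') == 10 * 1024
assert size_to_bytes('2MiB') == 2 * 1024 * 1024
assert size_to_bytes('1.5GB') == int(1.5 * (1 << 30))
assert size_to_bytes('1TB') == 1 << 40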
22 changes: 22 additions & 0 deletions data_juicer/utils/model_utils.py
@@ -170,6 +170,22 @@ def prepare_huggingface_tokenizer(tokenizer_name):
return tokenizer


def prepare_huggingface_clip(clip_name):
    """
    Prepare and load a CLIP model and its processor from HuggingFace.
    :param clip_name: input CLIP model name
    :return: a pair of CLIP model instance and processor instance.
    """
    from transformers import CLIPModel, CLIPProcessor

    logger.info('Loading CLIP model and processor from HuggingFace...')
    model = CLIPModel.from_pretrained(clip_name)
    processor = CLIPProcessor.from_pretrained(clip_name)

    return (model, processor)


def prepare_diversity_model(model_name, lang):
"""
Prepare diversity model for specific language.
@@ -222,6 +238,7 @@ def prepare_model(lang='en', model_type='sentencepiece', model_key=None):
'kenlm': ('%s.arpa.bin', prepare_kenlm_model),
'nltk': ('punkt.%s.pickle', prepare_nltk_model),
'huggingface': ('%s', prepare_huggingface_tokenizer),
'hf_clip': ('%s', prepare_huggingface_clip),
'spacy': ('%s_core_web_md-3.5.0', prepare_diversity_model),
}
assert model_type in type_to_name.keys(
@@ -236,6 +253,11 @@ def prepare_model(lang='en', model_type='sentencepiece', model_key=None):
MODEL_ZOO[model_key] = model_func(model_name)
elif model_type == 'huggingface':
MODEL_ZOO[model_key] = model_func(model_key)
elif model_type == 'hf_clip':
new_model_key = model_type + model_key
if new_model_key not in MODEL_ZOO.keys():
MODEL_ZOO[new_model_key] = model_func(model_key)
model_key = new_model_key
else:
MODEL_ZOO[model_key] = model_func(model_name, lang)
return model_key
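With the 'hf_clip' entry registered, an op obtains the model through the model zoo in two steps, exactly as ClipSimilarityFilter does above: prepare_model downloads and caches the (model, processor) pair under a key, and get_model retrieves the cached instance.

from data_juicer.utils.model_utils import get_model, prepare_model

model_key = prepare_model(model_type='hf_clip',
                          model_key='openai/clip-vit-base-patch32')
model, processor = get_model(model_key)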
4 changes: 3 additions & 1 deletion demos/overview_scan/app.py
@@ -89,7 +89,7 @@
 |-----------------------------------|:------:|-------------------------------------------------|
 | Formatter | 7 | Discovers, loads, and canonicalizes source data |
 | Mapper | 21 | Edits and transforms samples |
-| Filter | 17 | Filters out low-quality samples |
+| Filter | 19 | Filters out low-quality samples |
 | Deduplicator | 4 | Detects and removes duplicate samples |
 | Selector | 2 | Selects top samples based on ranking |
 '''
@@ -140,8 +140,10 @@
| alphanumeric_filter | General | en, zh | Keeps samples with alphanumeric ratio within the specified range |
| average_line_length_filter | Code | en, zh | Keeps samples with average line length within the specified range |
| character_repetition_filter | General | en, zh | Keeps samples with char-level n-gram repetition ratio within the specified range |
| clip_similarity_filter | Multimodal | - | Keeps samples with similarity between text and images within the specified range |
| flagged_words_filter | General | en, zh | Keeps samples with flagged-word ratio below the specified threshold |
| image_aspect_ratio_filter | Image | - | Keeps samples containing images with aspect ratios within the specified range |
| image_size_filter | Image | - | Keeps samples containing images whose sizes in bytes are within the specified range |
| language_id_score_filter | General | en, zh | Keeps samples of the specified language, judged by a predicted confidence score |
| maximum_line_length_filter | Code | en, zh | Keeps samples with maximum line length within the specified range |
| perplexity_filter | General | en, zh | Keeps samples with perplexity score below the specified threshold |