Commit: fix conflicts
chenhesen committed Nov 15, 2023
2 parents 5580fc9 + 985e475 commit 85e8f42
Showing 14 changed files with 657 additions and 5 deletions.
10 changes: 10 additions & 0 deletions configs/config_all.yaml
@@ -108,6 +108,12 @@ process:
rep_len: 10 # repetition length for char-level n-gram
min_ratio: 0.0 # the min ratio of filter range
max_ratio: 0.5 # the max ratio of filter range
- clip_similarity_filter: # filter samples according to the similarity between text and images.
hf_clip: openai/clip-vit-base-patch32 # name of the Hugging Face CLIP model to use
min_score: 0.1 # the min similarity of filter range
max_score: 1.0 # the max similarity of filter range
reduce_mode: avg # reduce mode when one text corresponds to multiple images in a chunk, must be one of ['avg', 'max', 'min']
any_or_all: any # keep this sample when any/all images meet the filter condition
- flagged_words_filter: # filter text with the flagged-word ratio larger than a specific max value
lang: en # consider flagged words in what language
tokenization: false # whether to use model to tokenize documents
@@ -120,6 +126,10 @@
min_ratio: 0.333 # the min aspect ratio of filter range
max_ratio: 3.0 # the max aspect ratio of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- image_size_filter: # filter samples according to the size of images (in bytes) within them
min_size: "0" # the min size of filter range
max_size: "1TB" # the max size of filter range
any_or_all: any # keep this sample when any/all images meet the filter condition
- language_id_score_filter: # filter text in specific language with language scores larger than a specific max value
lang: en # keep text in what language
min_score: 0.8 # the min language scores to filter text
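The two new YAML entries above map one-to-one onto the constructor arguments of the filter ops added in this commit. A minimal Python sketch of the equivalent configuration (import paths follow the new file locations; note that constructing the CLIP filter downloads the model on first use):

from data_juicer.ops.filter.clip_similarity_filter import ClipSimilarityFilter
from data_juicer.ops.filter.image_size_filter import ImageSizeFilter

# equivalent of the clip_similarity_filter entry above
clip_filter = ClipSimilarityFilter(hf_clip='openai/clip-vit-base-patch32',
                                   min_score=0.1,
                                   max_score=1.0,
                                   reduce_mode='avg',
                                   any_or_all='any')

# equivalent of the image_size_filter entry above
size_filter = ImageSizeFilter(min_size='0', max_size='1TB', any_or_all='any')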
5 changes: 3 additions & 2 deletions data_juicer/ops/filter/__init__.py
@@ -1,6 +1,7 @@
 from . import (alphanumeric_filter, average_line_length_filter,
-               character_repetition_filter, flagged_words_filter,
-               image_aspect_ratio_filter, language_id_score_filter,
+               character_repetition_filter, clip_similarity_filter,
+               flagged_words_filter, image_aspect_ratio_filter,
+               image_size_filter, language_id_score_filter,
                maximum_line_length_filter, perplexity_filter,
                special_characters_filter, specified_field_filter,
                specified_numeric_field_filter, stopwords_filter, suffix_filter,
158 changes: 158 additions & 0 deletions data_juicer/ops/filter/clip_similarity_filter.py
@@ -0,0 +1,158 @@
import numpy as np
import torch
from jsonargparse.typing import ClosedUnitInterval

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import SpecialTokens, load_image
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES

# avoid hanging when calling clip in multiprocessing
torch.set_num_threads(1)


@OPERATORS.register_module('clip_similarity_filter')
@LOADED_IMAGES.register_module('clip_similarity_filter')
class ClipSimilarityFilter(Filter):
"""Filter to keep samples those similarity between image and text
within a specific range."""

def __init__(self,
hf_clip='openai/clip-vit-base-patch32',
min_score: ClosedUnitInterval = 0.1,
max_score: ClosedUnitInterval = 1.0,
any_or_all: str = 'any',
reduce_mode: str = 'avg',
*args,
**kwargs):
"""
Initialization method.
:param hf_clip: clip model name on huggingface to compute
the similarity between image and text.
:param min_score: The min similarity to keep samples.
:param max_score: The max similarity to keep samples.
:param any_or_all: keep this sample with 'any' or 'all' strategy of
all images. 'any': keep this sample if any images meet the
condition. 'all': keep this sample only if all images meet the
condition.
:param reduce_mode: reduce mode when one text corresponds to
multiple images in a chunk.
'avg': Take the average of multiple values
'max': Take the max of multiple values
'min': Take the min of multiple values
:param args: extra args
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
self.min_score = min_score
self.max_score = max_score
if reduce_mode not in ['avg', 'max', 'min']:
raise ValueError(f'Reduce mode [{reduce_mode}] is not supported. '
f'Can only be one of ["avg", "max", "min"].')
if any_or_all not in ['any', 'all']:
raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
f'Can only be one of ["any", "all"].')
self.any = (any_or_all == 'any')
self.model_key = prepare_model(model_type='hf_clip', model_key=hf_clip)
self.reduce_mode = reduce_mode

def compute_stats(self, sample, context=False):
# check if it's computed already
if StatsKeys.clip_image_text_similarity in sample[Fields.stats]:
return sample

# there is no image in this sample
if self.image_key not in sample or not sample[self.image_key]:
sample[Fields.stats][
StatsKeys.clip_image_text_similarity] = np.array(
[], dtype=np.float64)
return sample

# load images
loaded_image_keys = sample[self.image_key]
images = {}
for loaded_image_key in loaded_image_keys:
if context and loaded_image_key in sample[Fields.context]:
# load from context
images[loaded_image_key] = sample[
Fields.context][loaded_image_key]
else:
if loaded_image_key not in images:
                    # avoid loading the same image multiple times
image = load_image(loaded_image_key)
images[loaded_image_key] = image
if context:
# store the image data into context
sample[Fields.context][loaded_image_key] = image

text = sample[self.text_key]
special_token_dict = {
key: value
for key, value in SpecialTokens.__dict__.items()
if not key.startswith('__')
}
offset = 0

def remove_special_token(text):
for value in special_token_dict.values():
text = text.replace(value, '')
return text

similarity = []
model, processor = get_model(self.model_key)

for chunk in text.split(SpecialTokens.eoc):
count = chunk.count(SpecialTokens.image)

# no image or no text
if count == 0 or len(chunk) == 0:
continue
else:
text_chunk = remove_special_token(chunk)
image_chunk = [
images[image_key]
for image_key in loaded_image_keys[offset:offset + count]
]

inputs = processor(text=text_chunk,
images=image_chunk,
return_tensors='pt',
truncation=True,
max_length=model.config.text_config.
max_position_embeddings,
padding=True)

                outputs = model(**inputs)
                # logits_per_text is cosine similarity scaled by CLIP's
                # trained logit scale (about 100), so dividing by 100 maps
                # it back to roughly [0, 1]
                chunk_logits = outputs.logits_per_text.detach().cpu() / 100.0

if self.reduce_mode == 'avg':
chunk_similarity = chunk_logits.mean()
elif self.reduce_mode == 'max':
chunk_similarity = chunk_logits.max()
else:
chunk_similarity = chunk_logits.min()

similarity.append(float(chunk_similarity))
offset += count
sample[Fields.stats][StatsKeys.clip_image_text_similarity] = similarity

return sample

def process(self, sample):
similarity = sample[Fields.stats][StatsKeys.clip_image_text_similarity]
if len(similarity) <= 0:
return True

keep_bools = np.array([
self.min_score <= sim_value <= self.max_score
for sim_value in similarity
])

# different strategies
if self.any:
return keep_bools.any()
else:
return keep_bools.all()
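For reference, a standalone sketch of the per-chunk similarity computation above, using the same Hugging Face CLIP classes; the image path and caption here are hypothetical:

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

image = Image.open('example.jpg')  # hypothetical image path
inputs = processor(text='a photo of a cat', images=[image],
                   return_tensors='pt', truncation=True, padding=True)
with torch.no_grad():
    outputs = model(**inputs)
# logits_per_text holds cosine similarities scaled by the trained logit
# scale (about 100), so dividing by 100 yields a score roughly in [0, 1]
similarity = float(outputs.logits_per_text[0, 0] / 100.0)
print(similarity)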
74 changes: 74 additions & 0 deletions data_juicer/ops/filter/image_size_filter.py
@@ -0,0 +1,74 @@
import numpy as np

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import get_image_size, size_to_bytes

from ..base_op import OPERATORS, Filter


@OPERATORS.register_module('image_size_filter')
class ImageSizeFilter(Filter):
"""Keep data samples whose image size (in bytes/kb/MB/...) within a
specific range.
"""

def __init__(self,
min_size: str = '0',
max_size: str = '1TB',
any_or_all: str = 'any',
*args,
**kwargs):
"""
Initialization method.
        :param min_size: The min image size to keep samples. Set to "0" by
            default, i.e., no lower size constraint.
        :param max_size: The max image size to keep samples. Set to "1TB" by
            default as an approximation of the unlimited case.
:param any_or_all: keep this sample with 'any' or 'all' strategy of
all images. 'any': keep this sample if any images meet the
condition. 'all': keep this sample only if all images meet the
condition.
:param args: extra args
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
self.min_size = size_to_bytes(min_size)
self.max_size = size_to_bytes(max_size)
if any_or_all not in ['any', 'all']:
raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
f'Can only be one of ["any", "all"].')
self.any = (any_or_all == 'any')

def compute_stats(self, sample, context=False):
# check if it's computed already
if StatsKeys.image_sizes in sample[Fields.stats]:
return sample

# there is no image in this sample
if self.image_key not in sample or not sample[self.image_key]:
sample[Fields.stats][StatsKeys.image_sizes] = np.array(
[], dtype=np.float64)
return sample

# for size calculation, no need to load images into memory
sample[Fields.stats][StatsKeys.image_sizes] = [
get_image_size(img_path) for img_path in sample[self.image_key]
]

return sample

def process(self, sample):
image_sizes = sample[Fields.stats][StatsKeys.image_sizes]
keep_bools = np.array([
self.min_size <= image_size <= self.max_size
for image_size in image_sizes
])
if len(keep_bools) <= 0:
return True

# different strategies
if self.any:
return keep_bools.any()
else:
return keep_bools.all()
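A small illustration of the 'any'/'all' keep strategies shared by the image filters, using hypothetical sizes:

import numpy as np

image_sizes = [512_000, 3_000_000]     # bytes, hypothetical sample
min_size, max_size = 0, 1 << 20        # keep images up to 1 MiB

keep_bools = np.array([min_size <= s <= max_size for s in image_sizes])
print(keep_bools.any())  # True: at least one image is within range
print(keep_bools.all())  # False: the 3 MB image exceeds max_size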
4 changes: 4 additions & 0 deletions data_juicer/utils/constant.py
@@ -28,6 +28,10 @@ class StatsKeys(object):

# image
aspect_ratios = 'aspect_ratios'
image_sizes = 'image_sizes'

# multimodal
clip_image_text_similarity = 'clip_image_text_similarity'


class HashKeys(object):
44 changes: 44 additions & 0 deletions data_juicer/utils/mm_utils.py
@@ -21,3 +21,47 @@ def load_image(path):
img_feature = Image()
img = img_feature.decode_example(img_feature.encode_example(path))
return img


def get_image_size(path):
import os
return os.path.getsize(path)


def size_to_bytes(size):
    alphabets_list = [char for char in size if char.isalpha()]
    # keep the decimal point so that sizes such as "1.5GB" parse correctly
    numbers_list = [char for char in size if char.isdigit() or char == '.']

    if len(numbers_list) == 0:
        raise ValueError(f'Your input `size` does not contain numbers: {size}')

    size_number = float(''.join(numbers_list))

    if len(alphabets_list) == 0:
        # by default, if users do not specify the unit, the number is
        # regarded as being in bytes
        return int(size_number)

    suffix = ''.join(alphabets_list).lower()

    # decimal (KB) and binary (KiB) suffixes are both treated as powers
    # of 1024
    shift_for_suffix = {
        'kb': 10, 'kib': 10,
        'mb': 20, 'mib': 20,
        'gb': 30, 'gib': 30,
        'tb': 40, 'tib': 40,
        'pb': 50, 'pib': 50,
        'eb': 60, 'eib': 60,
        'zb': 70, 'zib': 70,
        'yb': 80, 'yib': 80,
    }
    if suffix not in shift_for_suffix:
        raise ValueError(f'You specified an unidentifiable unit: {suffix}, '
                         f'expected one of [KB, MB, GB, TB, PB, EB, ZB, YB, '
                         f'KiB, MiB, GiB, TiB, PiB, EiB, ZiB, YiB] '
                         f'(case-insensitive, counted in *bytes*).')
    return int(size_number * (1 << shift_for_suffix[suffix]))
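Expected behavior of size_to_bytes given the implementation above: decimal and binary suffixes are both interpreted as powers of 1024, and bare numbers are taken as bytes.

assert size_to_bytes('0') == 0
assert size_to_bytes('10KB') == 10 * 1024
assert size_to_bytes('2MiB') == 2 * 1024 * 1024
assert size_to_bytes('1.5GB') == int(1.5 * (1 << 30))
assert size_to_bytes('1TB') == 1 << 40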
22 changes: 22 additions & 0 deletions data_juicer/utils/model_utils.py
@@ -170,6 +170,22 @@ def prepare_huggingface_tokenizer(tokenizer_name):
return tokenizer


def prepare_huggingface_clip(clip_name):
    """
    Prepare and load a CLIP model and its processor from HuggingFace.
    :param clip_name: input CLIP model name
    :return: a pair of CLIP model instance and processor instance.
    """
    from transformers import CLIPModel, CLIPProcessor

    logger.info('Loading CLIP model and processor from HuggingFace...')
    model = CLIPModel.from_pretrained(clip_name)
    processor = CLIPProcessor.from_pretrained(clip_name)

    return (model, processor)


def prepare_diversity_model(model_name, lang):
"""
Prepare diversity model for specific language.
@@ -222,6 +238,7 @@ def prepare_model(lang='en', model_type='sentencepiece', model_key=None):
'kenlm': ('%s.arpa.bin', prepare_kenlm_model),
'nltk': ('punkt.%s.pickle', prepare_nltk_model),
'huggingface': ('%s', prepare_huggingface_tokenizer),
'hf_clip': ('%s', prepare_huggingface_clip),
'spacy': ('%s_core_web_md-3.5.0', prepare_diversity_model),
}
assert model_type in type_to_name.keys(
@@ -236,6 +253,11 @@ def prepare_model(lang='en', model_type='sentencepiece', model_key=None):
MODEL_ZOO[model_key] = model_func(model_name)
elif model_type == 'huggingface':
MODEL_ZOO[model_key] = model_func(model_key)
elif model_type == 'hf_clip':
new_model_key = model_type + model_key
if new_model_key not in MODEL_ZOO.keys():
MODEL_ZOO[new_model_key] = model_func(model_key)
model_key = new_model_key
else:
MODEL_ZOO[model_key] = model_func(model_name, lang)
return model_key
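With the 'hf_clip' entry registered, an op obtains the model through the model zoo in two steps, exactly as ClipSimilarityFilter does above: prepare_model downloads and caches the (model, processor) pair under a key, and get_model retrieves the cached instance.

from data_juicer.utils.model_utils import get_model, prepare_model

model_key = prepare_model(model_type='hf_clip',
                          model_key='openai/clip-vit-base-patch32')
model, processor = get_model(model_key)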
4 changes: 3 additions & 1 deletion demos/overview_scan/app.py
@@ -89,7 +89,7 @@
 |-----------------------------------|:------:|-------------------------------------------------|
 | Formatter | 7 | Discovers, loads, and canonicalizes source data |
 | Mapper | 21 | Edits and transforms samples |
-| Filter | 17 | Filters out low-quality samples |
+| Filter | 19 | Filters out low-quality samples |
 | Deduplicator | 4 | Detects and removes duplicate samples |
 | Selector | 2 | Selects top samples based on ranking |
 '''
@@ -140,8 +140,10 @@
| alphanumeric_filter | General | en, zh | Keeps samples with alphanumeric ratio within the specified range |
| average_line_length_filter | Code | en, zh | Keeps samples with average line length within the specified range |
| character_repetition_filter | General | en, zh | Keeps samples with char-level n-gram repetition ratio within the specified range |
| clip_similarity_filter | Multimodal | - | Keeps samples with similarity between text and images within the specified range |
| flagged_words_filter | General | en, zh | Keeps samples with flagged-word ratio below the specified threshold |
| image_aspect_ratio_filter | Image | - | Keeps samples containing images with aspect ratios within the specified range |
| image_size_filter | Image | - | Keeps samples containing images whose sizes in bytes are within the specified range |
| language_id_score_filter | General | en, zh | Keeps samples of the specified language, judged by a predicted confidence score |
| maximum_line_length_filter | Code | en, zh | Keeps samples with maximum line length within the specified range |
| perplexity_filter | General | en, zh | Keeps samples with perplexity score below the specified threshold |