Commit: merge main
BeachWang committed Sep 19, 2024
2 parents 3c01fb9 + c40a308 commit faa03b8
Showing 123 changed files with 1,166 additions and 558 deletions.
10 changes: 10 additions & 0 deletions configs/config_all.yaml
@@ -122,6 +122,8 @@ process:
cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
+ - image_tagging_mapper: # Mapper to generate image tags.
+ tag_field_name: '__dj__image_tags__' # the field name to store the tags. It's "__dj__image_tags__" by default.
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
sequential: false # whether to combine all augmentation methods into a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method generates its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, a total of aug_num augmented samples will be generated. If it's False, (aug_num * #opened_aug_method) augmented samples will be generated.
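For illustration, here is a hypothetical sample before and after the image_tagging_mapper above runs; only the default field name is taken from the config, and the tag values depend entirely on the underlying tagging model:

    # hypothetical input sample
    sample = {'text': 'a photo of my dog <__dj__image>', 'images': ['dog.jpg']}
    # hypothetical output: one tag list per image, stored under the default field
    tagged = {'text': 'a photo of my dog <__dj__image>',
              'images': ['dog.jpg'],
              '__dj__image_tags__': [['dog', 'grass', 'outdoor']]}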
@@ -258,10 +260,12 @@ process:
show_progress: false # whether to show progress from scenedetect
- video_tagging_from_audio_mapper: # Mapper to generate video tags from audio streams extracted from the video.
hf_ast: 'MIT/ast-finetuned-audioset-10-10-0.4593' # Huggingface model name for the audio classification model.
+ tag_field_name: '__dj__video_audio_tags__' # the field name to store the tags. It's "__dj__video_audio_tags__" by default.
mem_required: '500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- video_tagging_from_frames_mapper: # Mapper to generate video tags from frames extracted from the video.
frame_sampling_method: 'all_keyframes' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former extracts all keyframes and the latter extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, the first and the last frames are extracted, and the remaining frames are sampled uniformly within the video duration.
+ tag_field_name: '__dj__video_frame_tags__' # the field name to store the tags. It's "__dj__video_frame_tags__" by default.
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.
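The frame_num rule above can be sketched as a small helper; this illustrates the stated sampling rule, not Data-Juicer's actual implementation:

    def uniform_frame_indices(total_frames: int, frame_num: int) -> list:
        # 1 -> middle frame; 2 -> first and last frames; >2 -> first, last,
        # and the rest spaced evenly in between
        if frame_num == 1:
            return [total_frames // 2]
        if frame_num == 2:
            return [0, total_frames - 1]
        step = (total_frames - 1) / (frame_num - 1)
        return [round(i * step) for i in range(frame_num)]

    print(uniform_frame_indices(100, 3))  # [0, 50, 99]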

# Filter ops
@@ -316,6 +320,11 @@ process:
score_threshold: 0.5 # the nsfw score threshold for samples, ranging from 0 to 1. Samples with a nsfw score less than this threshold will be kept.
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
+ - image_pair_similarity_filter: # filter samples according to the similarity score between the image pair.
+ hf_clip: 'openai/clip-vit-base-patch32' # model name of the CLIP model on huggingface
+ min_score: 0.1 # the min similarity score of filter range
+ max_score: 1.0 # the max similarity score of filter range
+ any_or_all: "any" # keep this sample when any/all images meet the filter condition
- image_shape_filter: # filter samples according to the widths and heights of images in them
min_width: 200 # the min width of width filter range
max_width: 5000 # the max width of width filter range
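As a rough sketch of how the pair score used by image_pair_similarity_filter above could be computed with the named CLIP checkpoint (an illustration built on the huggingface transformers API, not the filter's own code):

    import torch
    from PIL import Image
    from transformers import CLIPModel, CLIPProcessor

    model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
    processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

    def image_pair_similarity(path_a: str, path_b: str) -> float:
        inputs = processor(images=[Image.open(path_a), Image.open(path_b)],
                           return_tensors='pt')
        with torch.no_grad():
            feats = model.get_image_features(**inputs)
        feats = feats / feats.norm(dim=-1, keepdim=True)  # L2-normalize
        return float((feats[0] * feats[1]).sum())  # cosine similarity

A pair would then be kept when this score falls inside [min_score, max_score].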
@@ -473,6 +482,7 @@ process:
contain: any # require the videos to contain 'any' or 'all' of the given tags. When tags is [], 'all' keeps all samples while 'any' keeps none.
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former extracts all keyframes and the latter extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, the first and the last frames are extracted, and the remaining frames are sampled uniformly within the video duration.
+ tag_field_name: '__dj__video_frame_tags__' # the field name to store the tags. It's "__dj__video_frame_tags__" by default.
any_or_all: any # keep this sample when any/all videos meet the filter condition
- words_num_filter: # filter text with number of words out of specific range
lang: en # sample in which language
@@ -47,7 +47,7 @@ process:
- video_motion_score_filter: # Keep samples with video motion scores within a specific range.
min_score: 0.25 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
- sampling_fps: 2 # the samplig rate of frames_per_second to compute optical flow
+ sampling_fps: 2 # the sampling rate of frames_per_second to compute optical flow
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_nsfw_filter: # filter samples according to the nsfw scores of videos in them
hf_nsfw_model: Falconsai/nsfw_image_detection # Huggingface model name for nsfw classification
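A minimal sketch of what a motion score like the one above could look like, using OpenCV's Farneback optical flow; this is an illustration under assumed defaults, not the filter's actual code:

    import cv2
    import numpy as np

    def motion_score(video_path: str, sampling_fps: float = 2.0) -> float:
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS) or sampling_fps
        stride = max(int(round(fps / sampling_fps)), 1)  # frames to skip
        prev, mags, idx = None, [], 0
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            if idx % stride == 0:
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                if prev is not None:
                    flow = cv2.calcOpticalFlowFarneback(
                        prev, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
                    mags.append(np.linalg.norm(flow, axis=-1).mean())
                prev = gray
            idx += 1
        cap.release()
        return float(np.mean(mags)) if mags else 0.0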
2 changes: 1 addition & 1 deletion configs/demo/bench/model_train.yaml
@@ -13,7 +13,7 @@ train:
# https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors
transformer_path: "/PATH/TO/EASYANIMATE_MODEL"
dataset_path:
- # The root diretory to videos. Set empty if it is the absolute path in the dataset.
+ # The root directory to videos. Set empty if it is the absolute path in the dataset.
dataset_name: ""
# path to the Data-Juicer dataset. Note that the root path is in "thirdparth/models/EasyAnimate"
dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-for-train.jsonl"
2 changes: 1 addition & 1 deletion configs/demo/bench/model_train_2_epoch.yaml
@@ -13,7 +13,7 @@ train:
# https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors
transformer_path: "/PATH/TO/EASYANIMATE_MODEL"
dataset_path:
- # The root diretory to videos. Set empty if it is the absolute path in the dataset.
+ # The root directory to videos. Set empty if it is the absolute path in the dataset.
dataset_name: ""
# path to the Data-Juicer dataset. Note that the root path is in "thirdparth/easy_animate"
dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-with-multi-op-stats.jsonl"
4 changes: 2 additions & 2 deletions configs/demo/sandbox/inception_eval_config.yaml
@@ -4,9 +4,9 @@ type: video_inception_evaluator
fake_data_path: /path/to/the/generated/dj_format_dataset
# The path to the ground-truth dataset. Only the `jsonl` format is supported. The video paths are put in a list under the `videos` key. Required when computing FVD, FID, KID, and PR.
real_data_path: /path/to/the/groundtruth/dj_format_dataset
- # The root diretory to store the generated videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
+ # The root directory to store the generated videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
fake_mm_dir: null
- # The root diretory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
+ # The root directory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
real_mm_dir: null
# Path to the corresponding detection model. Download the model from the web if it is None.
detector_path: null
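For reference, one record of the expected dj_format jsonl might look like this (a hypothetical example; the paths are treated as relative once fake_mm_dir/real_mm_dir is set):

    import json

    record = {'videos': ['clips/sample_0001.mp4']}
    print(json.dumps(record))  # one such object per line of the jsonl file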
30 changes: 20 additions & 10 deletions data_juicer/config/config.py
@@ -5,7 +5,7 @@
import tempfile
import time
from argparse import ArgumentError, Namespace
- from typing import Dict, List, Tuple, Union
+ from typing import Dict, List, Union

import yaml
from jsonargparse import (ActionConfigFile, ArgumentParser, dict_to_namespace,
@@ -194,11 +194,17 @@ def init_configs(args=None):
'own special token according to your input dataset.')
parser.add_argument(
'--suffixes',
- type=Union[str, List[str], Tuple[str]],
+ type=Union[str, List[str]],
default=[],
help='Suffixes of files that will be found and loaded. If not set, we '
'will find files of all suffixes, and select a suitable formatter '
'with the most files as default.')
+ parser.add_argument(
+ '--turbo',
+ type=bool,
+ default=False,
+ help='Enable Turbo mode to maximize processing speed. Stability '
+ 'features like fault tolerance will be disabled.')
parser.add_argument(
'--use_cache',
type=bool,
@@ -470,6 +476,8 @@ def init_setup_from_cfg(cfg):
'image_key': cfg.image_key,
'audio_key': cfg.audio_key,
'video_key': cfg.video_key,
+ 'num_proc': cfg.np,
+ 'turbo': cfg.turbo,
}
else:
if 'text_key' not in args or args['text_key'] is None:
@@ -480,6 +488,10 @@
args['audio_key'] = cfg.audio_key
if 'video_key' not in args or args['video_key'] is None:
args['video_key'] = cfg.video_key
+ if 'num_proc' not in args or args['num_proc'] is None:
+ args['num_proc'] = cfg.np
+ if 'turbo' not in args or args['turbo'] is None:
+ args['turbo'] = cfg.turbo
op[op_name] = args

return cfg
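The two hunks above apply the same fill-in rule in both config branches; condensed, it behaves like this (a sketch, not the repo's exact code):

    def fill_op_defaults(op_args: dict, np: int, turbo: bool) -> dict:
        # global num_proc/turbo apply only when the op does not set its own
        for key, value in (('num_proc', np), ('turbo', turbo)):
            if op_args.get(key) is None:
                op_args[key] = value
        return op_args

    print(fill_op_defaults({'num_proc': 8}, np=4, turbo=True))
    # {'num_proc': 8, 'turbo': True}: the per-op num_proc wins, the global turbo fills in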
@@ -574,14 +586,12 @@ def update_op_process(cfg, parser):

# update op params of cfg.process
internal_op_para = temp_cfg.get(op_in_process_name)
- if internal_op_para is not None:
- num_proc = internal_op_para.get('num_proc')
- if 'num_proc' in internal_op_para:
- internal_op_para['num_proc'] = num_proc or cfg.np
- internal_op_para = namespace_to_dict(internal_op_para)
- else:
- internal_op_para = None
- cfg.process[i] = {op_in_process_name: internal_op_para}
+ cfg.process[i] = {
+ op_in_process_name:
+ None if internal_op_para is None else
+ namespace_to_dict(internal_op_para)
+ }

# check the op params via type hint
temp_parser = copy.deepcopy(parser)
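The dropped num_proc patching is no longer needed here because init_setup_from_cfg (see the hunk above) now injects num_proc into every op's args; what remains reduces to a single expression, illustrated below with a plain dict standing in for namespace_to_dict:

    def to_process_entry(op_name: str, internal_op_para, namespace_to_dict=dict):
        return {
            op_name:
            None if internal_op_para is None else
            namespace_to_dict(internal_op_para)
        }

    print(to_process_entry('words_num_filter', None))  # {'words_num_filter': None}
    print(to_process_entry('words_num_filter', {'min_num': 10}))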
9 changes: 6 additions & 3 deletions data_juicer/core/analyzer.py
@@ -39,9 +39,12 @@ def __init__(self, cfg=None):

# setup formatter
logger.info('Setting up data formatter...')
- self.formatter = load_formatter(self.cfg.dataset_path,
- self.cfg.text_keys, self.cfg.suffixes,
- self.cfg.add_suffix)
+ self.formatter = load_formatter(
+ dataset_path=self.cfg.dataset_path,
+ generated_dataset_config=self.cfg.generated_dataset_config,
+ text_keys=self.cfg.text_keys,
+ suffixes=self.cfg.suffixes,
+ add_suffix=self.cfg.add_suffix)

# prepare exporter and check export path suffix
# NOTICE: no need to export dataset texts for analyzer
15 changes: 10 additions & 5 deletions data_juicer/core/data.py
@@ -227,12 +227,17 @@ def map(self, *args, **kargs):
called_func, '__wrapped__'):
called_func = called_func.__wrapped__

- # Batched is always required for fault tolerance
if inspect.ismethod(called_func):
- kargs['batched'] = True
- kargs['batch_size'] = kargs.pop('batch_size', 1) if hasattr(
- called_func.__self__, 'is_batched_op'
- ) and called_func.__self__.is_batched_op() else 1
+ # batched is required for fault-tolerant or batched OP
+ if not called_func.__self__.turbo or hasattr(
+ called_func.__self__,
+ 'is_batched_op') and called_func.__self__.is_batched_op():
+ kargs['batched'] = True
+ kargs['batch_size'] = kargs.pop('batch_size', 1) if hasattr(
+ called_func.__self__, 'is_batched_op'
+ ) and called_func.__self__.is_batched_op() else 1
+ else:
+ kargs['batched'] = False

if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None:
new_fingerprint = generate_fingerprint(self, *args, **kargs)
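Condensed, the new dispatch rule in map() reads as follows (a sketch of the logic above, not the actual method):

    def resolve_map_kwargs(is_batched_op: bool, turbo: bool,
                           batch_size: int = 1) -> dict:
        # batching stays on for batched OPs, and for fault tolerance when
        # turbo is off; turbo trades that safety for speed
        if is_batched_op or not turbo:
            return {'batched': True,
                    'batch_size': batch_size if is_batched_op else 1}
        return {'batched': False}

    print(resolve_map_kwargs(is_batched_op=False, turbo=False))
    # {'batched': True, 'batch_size': 1}
    print(resolve_map_kwargs(is_batched_op=False, turbo=True))
    # {'batched': False}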
10 changes: 6 additions & 4 deletions data_juicer/core/executor.py
@@ -48,10 +48,12 @@ def __init__(self, cfg=None):

# setup formatter
logger.info('Setting up data formatter...')
- self.formatter = load_formatter(self.cfg.dataset_path,
- self.cfg.generated_dataset_config,
- self.cfg.text_keys, self.cfg.suffixes,
- self.cfg.add_suffix)
+ self.formatter = load_formatter(
+ dataset_path=self.cfg.dataset_path,
+ generated_dataset_config=self.cfg.generated_dataset_config,
+ text_keys=self.cfg.text_keys,
+ suffixes=self.cfg.suffixes,
+ add_suffix=self.cfg.add_suffix)

# whether to use checkpoint mechanism. If it's true, Executor will
# check if there are existing checkpoints first and try to load the
4 changes: 2 additions & 2 deletions data_juicer/format/formatter.py
@@ -1,5 +1,5 @@
import os
- from typing import List, Tuple, Union
+ from typing import List, Union

from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
from loguru import logger
@@ -27,7 +27,7 @@ def __init__(
self,
dataset_path: str,
type: str,
- suffixes: Union[str, List[str], Tuple[str]] = None,
+ suffixes: Union[str, List[str], None] = None,
text_keys: List[str] = None,
add_suffix=False,
**kwargs,
4 changes: 2 additions & 2 deletions data_juicer/format/mixture_formatter.py
@@ -1,5 +1,5 @@
from itertools import chain, repeat
- from typing import List, Tuple, Union
+ from typing import List, Union

import numpy as np
from datasets import Dataset, concatenate_datasets
@@ -15,7 +15,7 @@ class MixtureFormatter(BaseFormatter):

def __init__(self,
dataset_path: str,
- suffixes: Union[str, List[str], Tuple[str]] = None,
+ suffixes: Union[str, List[str], None] = None,
text_keys=None,
add_suffix=False,
max_samples=None,
2 changes: 2 additions & 0 deletions data_juicer/ops/base_op.py
@@ -149,6 +149,8 @@ def __init__(self, *args, **kwargs):
if isinstance(self.mem_required, str):
self.mem_required = size_to_bytes(self.mem_required) / 1024**3

+ self.turbo = kwargs.get('turbo', False)

# nested wrappers
from data_juicer.core.data import wrap_func_with_nested_access
for name in ['process', 'compute_stats', 'compute_hash']:
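With this in place, any OP can receive the flag through its keyword arguments (a minimal sketch, assuming data_juicer is importable):

    from data_juicer.ops.base_op import Mapper

    class MyMapper(Mapper):
        """Hypothetical OP; self.turbo is populated by the base class."""

    op = MyMapper(turbo=True)
    print(op.turbo)  # True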
14 changes: 8 additions & 6 deletions data_juicer/ops/deduplicator/document_minhash_deduplicator.py
@@ -5,12 +5,14 @@
import hashlib
import struct
from collections import defaultdict
+ from typing import Optional

import numpy as np
import regex
- from jsonargparse.typing import ClosedUnitInterval, PositiveInt
from loguru import logger
+ from pydantic import Field, PositiveInt
from tqdm import tqdm
+ from typing_extensions import Annotated

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
@@ -109,12 +111,12 @@ def __init__(
tokenization: str = 'space',
window_size: PositiveInt = 5,
lowercase: bool = True,
- ignore_pattern: str = None,
+ ignore_pattern: Optional[str] = None,
num_permutations: PositiveInt = 256,
- jaccard_threshold: ClosedUnitInterval = 0.7,
- num_bands: PositiveInt = None,
- num_rows_per_band: PositiveInt = None,
- tokenizer_model: str = None,
+ jaccard_threshold: Annotated[float, Field(ge=0, le=1)] = 0.7,
+ num_bands: Optional[PositiveInt] = None,
+ num_rows_per_band: Optional[PositiveInt] = None,
+ tokenizer_model: Optional[str] = None,
*args,
**kwargs,
):
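The pydantic annotation replaces jsonargparse's ClosedUnitInterval with an equivalent [0, 1] range constraint; under pydantic v2 such a constraint can be enforced at call time, for example (a sketch of the mechanism, not code from this repo):

    from pydantic import Field, validate_call
    from typing_extensions import Annotated

    @validate_call
    def set_threshold(t: Annotated[float, Field(ge=0, le=1)] = 0.7) -> float:
        return t

    set_threshold(0.7)  # ok
    set_threshold(1.5)  # raises a pydantic ValidationError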
6 changes: 3 additions & 3 deletions data_juicer/ops/deduplicator/document_simhash_deduplicator.py
@@ -3,12 +3,12 @@
# --------------------------------------------------------

from collections import defaultdict, deque
- from typing import Dict, Set
+ from typing import Dict, Optional, Set

import numpy as np
import regex
- from jsonargparse.typing import PositiveInt
from loguru import logger
+ from pydantic import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
@@ -30,7 +30,7 @@ def __init__(self,
tokenization: str = 'space',
window_size: PositiveInt = 6,
lowercase: bool = True,
- ignore_pattern: str = None,
+ ignore_pattern: Optional[str] = None,
num_blocks: PositiveInt = 6,
hamming_distance: PositiveInt = 4,
*args,
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/image_deduplicator.py
@@ -104,7 +104,7 @@ def process(self, dataset, show_num=0):
if show_num > 0:
# sample duplicate pairs
if self.consider_text:
- hash2ids: Dict[Tuple[int], Set[int]] = defaultdict(set)
+ hash2ids: Dict[Tuple[int, int], Set[int]] = defaultdict(set)
hashes = zip(dataset[HashKeys.imagehash],
dataset[HashKeys.hash])
else:
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/ray_basic_deduplicator.py
@@ -1,6 +1,6 @@
from typing import Any

- from jsonargparse.typing import PositiveInt
+ from pydantic import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/ray_document_deduplicator.py
@@ -2,7 +2,7 @@
import string

import regex as re
- from jsonargparse.typing import PositiveInt
+ from pydantic import PositiveInt

from ..base_op import OPERATORS
from .ray_basic_deduplicator import RayBasicDeduplicator
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/ray_image_deduplicator.py
@@ -1,5 +1,5 @@
import numpy as np
- from jsonargparse.typing import PositiveInt
+ from pydantic import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.mm_utils import load_data_with_context, load_image
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/ray_video_deduplicator.py
@@ -1,6 +1,6 @@
import hashlib

- from jsonargparse.typing import PositiveInt
+ from pydantic import PositiveInt

from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
load_video)
(diffs for the remaining changed files are not shown)
