Service/match api #428

Merged: 12 commits, Sep 13, 2024
5 changes: 5 additions & 0 deletions configs/config_all.yaml
@@ -122,6 +122,8 @@ process:
cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
- image_tagging_mapper: # Mapper to generate image tags.
tag_field_name: '__dj__image_tags__' # the field name to store the tags. It's "__dj__image_tags__" by default.
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
sequential: false # whether to combine all augmentation methods into a sequence. If it's True, a sample will be augmented by all enabled augmentation methods sequentially. If it's False, each enabled augmentation method generates its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, aug_num augmented samples will be generated in total. If it's False, (aug_num * number of enabled augmentation methods) samples will be generated.
@@ -258,10 +260,12 @@ process:
show_progress: false # whether to show progress from scenedetect
- video_tagging_from_audio_mapper: # Mapper to generate video tags from audio streams extracted from the video.
hf_ast: 'MIT/ast-finetuned-audioset-10-10-0.4593' # Huggingface model name for the audio classification model.
tag_field_name: '__dj__video_audio_tags__' # the field name to store the tags. It's "__dj__video_audio_tags__" by default.
mem_required: '500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- video_tagging_from_frames_mapper: # Mapper to generate video tags from frames extracted from the video.
frame_sampling_method: 'all_keyframes' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former extracts all key frames and the latter extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
tag_field_name: '__dj__video_frame_tags__' # the field name to store the tags. It's "__dj__video_frame_tags__" by default.
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.

# Filter ops
@@ -478,6 +482,7 @@ process:
contain: any # require the videos to contain 'any' or 'all' of the given tags. When tags is [], 'all' keeps all samples and 'any' keeps none.
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former extracts all key frames and the latter extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
tag_field_name: '__dj__video_frame_tags__' # the field name to store the tags. It's "__dj__video_frame_tags__" by default.
any_or_all: any # keep this sample when any/all videos meet the filter condition
- words_num_filter: # filter text with number of words out of specific range
lang: en # sample in which language
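The five additions to config_all.yaml above all introduce a `tag_field_name` knob on the tagging ops. A minimal sketch of overriding it, parsed with plain PyYAML; `my_frame_tags` is a made-up custom value, while the quoted default comes from the diff:

```python
import yaml

# Mirrors the entries added in this diff; only tag_field_name is the new knob.
cfg = yaml.safe_load("""
process:
  - image_tagging_mapper:
      tag_field_name: '__dj__image_tags__'    # the documented default
  - video_tagging_from_frames_mapper:
      frame_sampling_method: 'all_keyframes'
      tag_field_name: 'my_frame_tags'         # custom field to store the tags
""")
ops = cfg['process']
assert ops[1]['video_tagging_from_frames_mapper']['tag_field_name'] == 'my_frame_tags'
```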
@@ -47,7 +47,7 @@ process:
- video_motion_score_filter: # Keep samples with video motion scores within a specific range.
min_score: 0.25 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
sampling_fps: 2 # the samplig rate of frames_per_second to compute optical flow
sampling_fps: 2 # the sampling rate of frames_per_second to compute optical flow
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_nsfw_filter: # filter samples according to the nsfw scores of videos in them
hf_nsfw_model: Falconsai/nsfw_image_detection # Huggingface model name for nsfw classification
2 changes: 1 addition & 1 deletion configs/demo/bench/model_train.yaml
@@ -13,7 +13,7 @@ train:
# https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors
transformer_path: "/PATH/TO/EASYANIMATE_MODEL"
dataset_path:
# The root diretory to videos. Set empty if it is the absolute path in the dataset.
# The root directory to videos. Set empty if it is the absolute path in the dataset.
dataset_name: ""
# path to the Data-Juicer dataset. Note that the root path is in "thirdparth/models/EasyAnimate"
dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-for-train.jsonl"
2 changes: 1 addition & 1 deletion configs/demo/bench/model_train_2_epoch.yaml
@@ -13,7 +13,7 @@ train:
# https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors
transformer_path: "/PATH/TO/EASYANIMATE_MODEL"
dataset_path:
# The root diretory to videos. Set empty if it is the absolute path in the dataset.
# The root directory to videos. Set empty if it is the absolute path in the dataset.
dataset_name: ""
# path to the Data-Juicer dataset. Note that the root path is in "thirdparth/easy_animate"
dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-with-multi-op-stats.jsonl"
4 changes: 2 additions & 2 deletions configs/demo/sandbox/inception_eval_config.yaml
@@ -4,9 +4,9 @@ type: video_inception_evaluator
fake_data_path: /path/to/the/generated/dj_format_dataset
# The path to the ground truth dataset. Only the `jsonl` format is supported. The video paths are put in a list under the `videos` key. Required when computing FVD, FID, KID, and PR.
real_data_path: /path/to/the/groundtruth/dj_format_dataset
# The root diretory to store the generated videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
# The root directory to store the generated videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
fake_mm_dir: null
# The root diretory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
# The root directory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
real_mm_dir: null
# Path to the corresponding detection model. Download the model from web if it is None.
detector_path: null
5 changes: 2 additions & 3 deletions data_juicer/config/__init__.py
@@ -2,7 +2,6 @@
merge_config, prepare_side_configs)

__all__ = [
'init_configs',
'export_config',
'merge_config',
'init_configs', 'get_init_configs', 'export_config', 'merge_config',
'prepare_side_configs'
]
32 changes: 16 additions & 16 deletions data_juicer/config/config.py
@@ -4,12 +4,12 @@
import shutil
import tempfile
import time
from argparse import ArgumentError, Namespace
from typing import Dict, List, Tuple, Union
from argparse import ArgumentError
from typing import Dict, List, Optional, Union

import yaml
from jsonargparse import (ActionConfigFile, ArgumentParser, dict_to_namespace,
namespace_to_dict)
from jsonargparse import (ActionConfigFile, ArgumentParser, Namespace,
dict_to_namespace, namespace_to_dict)
from jsonargparse.typehints import ActionTypeHint
from jsonargparse.typing import ClosedUnitInterval, NonNegativeInt, PositiveInt
from loguru import logger
@@ -22,7 +22,7 @@
global_parser = None


def init_configs(args=None):
def init_configs(args: Optional[List[str]] = None):
"""
initialize the jsonargparse parser and parse configs from one of:
1. POSIX-style command-line args;
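Given the new `Optional[List[str]]` annotation, both call styles below should be valid; a sketch with a placeholder config path:

```python
from data_juicer.config import init_configs

# No args: parse from sys.argv (POSIX-style command-line args).
cfg = init_configs()
# Explicit list, handy for tests; the YAML path is a placeholder.
cfg = init_configs(['--config', 'configs/demo.yaml'])
```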
@@ -194,7 +194,7 @@ def init_configs(args=None):
'own special token according to your input dataset.')
parser.add_argument(
'--suffixes',
type=Union[str, List[str], Tuple[str]],
type=Union[str, List[str]],
default=[],
help='Suffixes of files that will be found and loaded. If not set, we '
'will find all suffix files, and select a suitable formatter '
@@ -658,13 +658,13 @@ def display_config(cfg):
print(table)


def export_config(cfg,
path,
format='yaml',
skip_none=True,
skip_check=True,
overwrite=False,
multifile=True):
def export_config(cfg: Namespace,
path: str,
format: str = 'yaml',
skip_none: bool = True,
skip_check: bool = True,
overwrite: bool = False,
multifile: bool = True):
"""
Save the config object; some params are from jsonargparse
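A usage sketch of the now fully annotated signature; `cfg` is assumed to come from `init_configs`, and both paths are placeholders:

```python
from data_juicer.config import export_config, init_configs

cfg = init_configs(['--config', 'configs/demo.yaml'])
export_config(cfg,                  # jsonargparse Namespace
              'outputs/exported.yaml',
              format='yaml',
              skip_none=True,       # drop entries whose value is None
              skip_check=True,
              overwrite=True)
```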

@@ -700,7 +700,7 @@ def export_config(cfg,
logger.info(f'Saved the configuration in {path}')


def merge_config(ori_cfg, new_cfg: Dict):
def merge_config(ori_cfg: Namespace, new_cfg: Namespace):
"""
Merge configuration from new_cfg into ori_cfg

@@ -758,7 +758,7 @@ def merge_config(ori_cfg, new_cfg: Dict):
logger.error('Config merge failed')


def prepare_side_configs(ori_config):
def prepare_side_configs(ori_config: Union[str, Namespace, Dict]):
"""
parse the config if ori_config is a string of a config file path with
yaml, yml or json format
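Per the new `Union[str, Namespace, Dict]` annotation, all three input kinds below should be accepted; a sketch using the sandbox config touched by this PR plus a made-up inline dict:

```python
from data_juicer.config import prepare_side_configs

# From a yaml/yml/json file path ...
side = prepare_side_configs('configs/demo/sandbox/inception_eval_config.yaml')
# ... or from an already-built dict (contents are illustrative).
side = prepare_side_configs({'type': 'video_inception_evaluator',
                             'detector_path': None})
```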
@@ -790,7 +790,7 @@ def prepare_side_configs(ori_config):
return config


def get_init_configs(cfg):
def get_init_configs(cfg: Namespace):
"""
set init configs of Data-Juicer for cfg
"""
11 changes: 8 additions & 3 deletions data_juicer/core/analyzer.py
@@ -1,6 +1,9 @@
import os
from typing import Optional

from jsonargparse import Namespace
from loguru import logger
from pydantic import PositiveInt

from data_juicer.analysis import ColumnWiseAnalysis, OverallAnalysis
from data_juicer.config import init_configs
@@ -22,11 +25,11 @@ class Analyzer:
dataset better.
"""

def __init__(self, cfg=None):
def __init__(self, cfg: Optional[Namespace] = None):
"""
Initialization method.

:param cfg: optional config dict.
:param cfg: optional jsonargparse Namespace.
"""
self.cfg = init_configs() if cfg is None else cfg

@@ -65,7 +68,9 @@ def __init__(self, cfg=None):
self.overall_single_plot_path = None
self.analysis_path = os.path.join(self.cfg.work_dir, 'analysis')

def run(self, load_data_np=None, skip_export=False):
def run(self,
load_data_np: Optional[PositiveInt] = None,
skip_export: bool = False):
"""
Running the dataset analysis pipeline.
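A hedged end-to-end sketch of the typed Analyzer API shown in this diff; the config path is a placeholder, and reading `load_data_np` as a dataset-loading process count is inferred from the name:

```python
from data_juicer.config import init_configs
from data_juicer.core.analyzer import Analyzer

cfg = init_configs(['--config', 'configs/demo.yaml'])
analyzer = Analyzer(cfg)        # cfg: Optional[Namespace]
analyzer.run(load_data_np=4,    # PositiveInt; assumed: loader processes
             skip_export=True)  # analyze without writing export files
```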

10 changes: 7 additions & 3 deletions data_juicer/core/executor.py
@@ -1,7 +1,10 @@
import os
from time import time
from typing import Optional

from jsonargparse import Namespace
from loguru import logger
from pydantic import PositiveInt

from data_juicer.config import init_configs
from data_juicer.core.data import Dataset
@@ -27,11 +30,11 @@ class Executor:
ops in the config file in order and generate a processed dataset.
"""

def __init__(self, cfg=None):
def __init__(self, cfg: Optional[Namespace] = None):
"""
Initialization method.

:param cfg: optional config dict.
:param cfg: optional jsonargparse Namespace.
"""
self.cfg = init_configs() if cfg is None else cfg

@@ -135,7 +138,7 @@ def sample_data(self,
else:
raise ValueError(f'Unsupported sample_algo: {sample_algo}')

def run(self, load_data_np=None):
def run(self, load_data_np: Optional[PositiveInt] = None):
"""
Running the dataset process pipeline.

@@ -175,4 +178,5 @@ def run(self, load_data_np=None):
if self.cfg.use_cache and self.cfg.cache_compress:
from data_juicer.utils.compress import compress
compress(dataset)

return dataset
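The added `return dataset` makes `run()` usable as an expression; a sketch with a placeholder config path:

```python
from data_juicer.config import init_configs
from data_juicer.core.executor import Executor

cfg = init_configs(['--config', 'configs/demo.yaml'])
processed = Executor(cfg).run(load_data_np=2)  # run() now returns the dataset
print(len(processed))                          # processed samples remaining
```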
4 changes: 2 additions & 2 deletions data_juicer/format/formatter.py
@@ -1,5 +1,5 @@
import os
from typing import List, Tuple, Union
from typing import List, Union

from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
from loguru import logger
@@ -27,7 +27,7 @@ def __init__(
self,
dataset_path: str,
type: str,
suffixes: Union[str, List[str], Tuple[str]] = None,
suffixes: Union[str, List[str], None] = None,
text_keys: List[str] = None,
add_suffix=False,
**kwargs,
4 changes: 2 additions & 2 deletions data_juicer/format/mixture_formatter.py
@@ -1,5 +1,5 @@
from itertools import chain, repeat
from typing import List, Tuple, Union
from typing import List, Union

import numpy as np
from datasets import Dataset, concatenate_datasets
@@ -15,7 +15,7 @@ class MixtureFormatter(BaseFormatter):

def __init__(self,
dataset_path: str,
suffixes: Union[str, List[str], Tuple[str]] = None,
suffixes: Union[str, List[str], None] = None,
text_keys=None,
add_suffix=False,
max_samples=None,
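The PR does not say why `Tuple[str]` was dropped from the `suffixes` union here and in formatter.py; the sketch below records, as an assumption, the typing fact that makes the removal harmless: `Tuple[str]` annotates a 1-tuple only.

```python
from typing import List, Tuple, Union

one_suffix: Tuple[str] = ('jsonl',)      # the only shape Tuple[str] admits
# two: Tuple[str] = ('jsonl', 'json')    # rejected by type checkers
suffixes: Union[str, List[str], None] = ['jsonl', 'json']
```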
14 changes: 8 additions & 6 deletions data_juicer/ops/deduplicator/document_minhash_deduplicator.py
@@ -5,12 +5,14 @@
import hashlib
import struct
from collections import defaultdict
from typing import Optional

import numpy as np
import regex
from jsonargparse.typing import ClosedUnitInterval, PositiveInt
from loguru import logger
from pydantic import Field, PositiveInt
from tqdm import tqdm
from typing_extensions import Annotated

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
@@ -109,12 +111,12 @@ def __init__(
tokenization: str = 'space',
window_size: PositiveInt = 5,
lowercase: bool = True,
ignore_pattern: str = None,
ignore_pattern: Optional[str] = None,
num_permutations: PositiveInt = 256,
jaccard_threshold: ClosedUnitInterval = 0.7,
num_bands: PositiveInt = None,
num_rows_per_band: PositiveInt = None,
tokenizer_model: str = None,
jaccard_threshold: Annotated[float, Field(ge=0, le=1)] = 0.7,
num_bands: Optional[PositiveInt] = None,
num_rows_per_band: Optional[PositiveInt] = None,
tokenizer_model: Optional[str] = None,
*args,
**kwargs,
):
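`ClosedUnitInterval` from jsonargparse gives way to the pydantic v2 `Annotated` idiom shown above. A standalone sketch of the same constraint; `validate_call` is used here only for demonstration and is not implied by the diff:

```python
from pydantic import Field, validate_call
from typing_extensions import Annotated

@validate_call
def set_jaccard(threshold: Annotated[float, Field(ge=0, le=1)] = 0.7) -> float:
    # Accepts only values in the closed unit interval [0, 1].
    return threshold

set_jaccard(0.7)    # ok
# set_jaccard(1.5)  # raises pydantic.ValidationError
```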
6 changes: 3 additions & 3 deletions data_juicer/ops/deduplicator/document_simhash_deduplicator.py
@@ -3,12 +3,12 @@
# --------------------------------------------------------

from collections import defaultdict, deque
from typing import Dict, Set
from typing import Dict, Optional, Set

import numpy as np
import regex
from jsonargparse.typing import PositiveInt
from loguru import logger
from pydantic import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
@@ -30,7 +30,7 @@ def __init__(self,
tokenization: str = 'space',
window_size: PositiveInt = 6,
lowercase: bool = True,
ignore_pattern: str = None,
ignore_pattern: Optional[str] = None,
num_blocks: PositiveInt = 6,
hamming_distance: PositiveInt = 4,
*args,
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/image_deduplicator.py
@@ -104,7 +104,7 @@ def process(self, dataset, show_num=0):
if show_num > 0:
# sample duplicate pairs
if self.consider_text:
hash2ids: Dict[Tuple[int], Set[int]] = defaultdict(set)
hash2ids: Dict[Tuple[int, int], Set[int]] = defaultdict(set)
hashes = zip(dataset[HashKeys.imagehash],
dataset[HashKeys.hash])
else:
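The key type fix matches the data: with `consider_text` enabled, each sample is keyed by the pair `(imagehash, hash)`. A toy reconstruction with made-up hash values:

```python
from collections import defaultdict
from typing import Dict, Set, Tuple

hash2ids: Dict[Tuple[int, int], Set[int]] = defaultdict(set)
image_hashes, text_hashes = [0xA1, 0xB2, 0xA1], [7, 8, 7]
for idx, pair in enumerate(zip(image_hashes, text_hashes)):
    hash2ids[pair].add(idx)

assert hash2ids[(0xA1, 7)] == {0, 2}  # samples 0 and 2 collide on both hashes
```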
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/ray_basic_deduplicator.py
@@ -1,6 +1,6 @@
from typing import Any

from jsonargparse.typing import PositiveInt
from pydantic import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
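This file and the three `ray_*` deduplicators below make the same one-line swap from `jsonargparse.typing` to pydantic. A sketch of the drop-in validation behavior; the field name is borrowed from the minhash op above, not from these files:

```python
from pydantic import BaseModel, PositiveInt

class DedupArgs(BaseModel):
    num_permutations: PositiveInt = 256  # must be a strictly positive int

DedupArgs(num_permutations=128)   # ok
# DedupArgs(num_permutations=0)   # raises pydantic.ValidationError
```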
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/ray_document_deduplicator.py
@@ -2,7 +2,7 @@
import string

import regex as re
from jsonargparse.typing import PositiveInt
from pydantic import PositiveInt

from ..base_op import OPERATORS
from .ray_basic_deduplicator import RayBasicDeduplicator
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/ray_image_deduplicator.py
@@ -1,5 +1,5 @@
import numpy as np
from jsonargparse.typing import PositiveInt
from pydantic import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.mm_utils import load_data_with_context, load_image
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/ray_video_deduplicator.py
@@ -1,6 +1,6 @@
import hashlib

from jsonargparse.typing import PositiveInt
from pydantic import PositiveInt

from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
load_video)
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/video_deduplicator.py
@@ -85,7 +85,7 @@ def process(self, dataset, show_num=0):
if show_num > 0:
# sample duplicate pairs
if self.consider_text:
hash2ids: Dict[Tuple[int], Set[int]] = defaultdict(set)
hash2ids: Dict[Tuple[int, int], Set[int]] = defaultdict(set)
hashes = zip(dataset[HashKeys.videohash],
dataset[HashKeys.hash])
else:
4 changes: 1 addition & 3 deletions data_juicer/ops/filter/alphanumeric_filter.py
@@ -1,7 +1,5 @@
import sys

from jsonargparse.typing import PositiveFloat

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.model_utils import get_model, prepare_model
@@ -23,7 +21,7 @@ class AlphanumericFilter(Filter):
def __init__(self,
tokenization: bool = False,
min_ratio: float = 0.25,
max_ratio: PositiveFloat = sys.maxsize,
max_ratio: float = sys.maxsize,
*args,
**kwargs):
"""