Commit: merge main
BeachWang committed Sep 19, 2024
2 parents 3c01fb9 + c40a308 commit faa03b8
Showing 123 changed files with 1,166 additions and 558 deletions.
10 changes: 10 additions & 0 deletions configs/config_all.yaml
@@ -122,6 +122,8 @@ process:
cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
+ - image_tagging_mapper: # Mapper to generate image tags.
+ tag_field_name: '__dj__image_tags__' # the field name to store the tags. It's "__dj__image_tags__" by default.
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
sequential: false # whether to combine all augmentation methods into a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method generates its augmented samples independently.
aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, a total of aug_num augmented samples will be generated. If it's False, (aug_num * #opened_aug_method) augmented samples will be generated.
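For illustration, here is a hypothetical sample before and after the image_tagging_mapper above runs; only the default field name is taken from the config, and the tag values depend entirely on the underlying tagging model:

    # hypothetical input sample
    sample = {'text': 'a photo of my dog <__dj__image>', 'images': ['dog.jpg']}
    # hypothetical output: one tag list per image, stored under the default field
    tagged = {'text': 'a photo of my dog <__dj__image>',
              'images': ['dog.jpg'],
              '__dj__image_tags__': [['dog', 'grass', 'outdoor']]}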
@@ -258,10 +260,12 @@ process:
show_progress: false # whether to show progress from scenedetect
- video_tagging_from_audio_mapper: # Mapper to generate video tags from audio streams extracted from the video.
hf_ast: 'MIT/ast-finetuned-audioset-10-10-0.4593' # Huggingface model name for the audio classification model.
+ tag_field_name: '__dj__video_audio_tags__' # the field name to store the tags. It's "__dj__video_audio_tags__" by default.
mem_required: '500MB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
- video_tagging_from_frames_mapper: # Mapper to generate video tags from frames extracted from the video.
frame_sampling_method: 'all_keyframes' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former extracts all keyframes and the latter extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, the first and the last frames are extracted, and the remaining frames are sampled uniformly within the video duration.
+ tag_field_name: '__dj__video_frame_tags__' # the field name to store the tags. It's "__dj__video_frame_tags__" by default.
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.
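The frame_num rule above can be sketched as a small helper; this illustrates the stated sampling rule, not Data-Juicer's actual implementation:

    def uniform_frame_indices(total_frames: int, frame_num: int) -> list:
        # 1 -> middle frame; 2 -> first and last frames; >2 -> first, last,
        # and the rest spaced evenly in between
        if frame_num == 1:
            return [total_frames // 2]
        if frame_num == 2:
            return [0, total_frames - 1]
        step = (total_frames - 1) / (frame_num - 1)
        return [round(i * step) for i in range(frame_num)]

    print(uniform_frame_indices(100, 3))  # [0, 50, 99]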

# Filter ops
@@ -316,6 +320,11 @@ process:
score_threshold: 0.5 # the nsfw score threshold for samples, ranging from 0 to 1. Samples with a nsfw score less than this threshold will be kept.
any_or_all: any # keep this sample when any/all images meet the filter condition
mem_required: '1GB' # This operation (Op) utilizes deep neural network models that consume a significant amount of memory for computation, hence the system's available memory might constrain the maximum number of processes that can be launched
+ - image_pair_similarity_filter: # filter samples according to the similarity score between the image pair.
+ hf_clip: 'openai/clip-vit-base-patch32' # model name of the CLIP model on huggingface
+ min_score: 0.1 # the min similarity score of filter range
+ max_score: 1.0 # the max similarity score of filter range
+ any_or_all: "any" # keep this sample when any/all images meet the filter condition
- image_shape_filter: # filter samples according to the widths and heights of images in them
min_width: 200 # the min width of width filter range
max_width: 5000 # the max width of width filter range
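As a rough sketch of how the pair score used by image_pair_similarity_filter above could be computed with the named CLIP checkpoint (an illustration built on the huggingface transformers API, not the filter's own code):

    import torch
    from PIL import Image
    from transformers import CLIPModel, CLIPProcessor

    model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
    processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

    def image_pair_similarity(path_a: str, path_b: str) -> float:
        inputs = processor(images=[Image.open(path_a), Image.open(path_b)],
                           return_tensors='pt')
        with torch.no_grad():
            feats = model.get_image_features(**inputs)
        feats = feats / feats.norm(dim=-1, keepdim=True)  # L2-normalize
        return float((feats[0] * feats[1]).sum())  # cosine similarity

A pair would then be kept when this score falls inside [min_score, max_score].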
@@ -473,6 +482,7 @@ process:
contain: any # require the videos to contain 'any' or 'all' of the given tags. When tags is [], 'all' keeps all samples while 'any' keeps none.
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former extracts all keyframes and the latter extracts a specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, the first and the last frames are extracted, and the remaining frames are sampled uniformly within the video duration.
+ tag_field_name: '__dj__video_frame_tags__' # the field name to store the tags. It's "__dj__video_frame_tags__" by default.
any_or_all: any # keep this sample when any/all videos meet the filter condition
- words_num_filter: # filter text with number of words out of specific range
lang: en # sample in which language
@@ -47,7 +47,7 @@ process:
- video_motion_score_filter: # Keep samples with video motion scores within a specific range.
min_score: 0.25 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
- sampling_fps: 2 # the samplig rate of frames_per_second to compute optical flow
+ sampling_fps: 2 # the sampling rate of frames_per_second to compute optical flow
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_nsfw_filter: # filter samples according to the nsfw scores of videos in them
hf_nsfw_model: Falconsai/nsfw_image_detection # Huggingface model name for nsfw classification
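A minimal sketch of what a motion score like the one above could look like, using OpenCV's Farneback optical flow; this is an illustration under assumed defaults, not the filter's actual code:

    import cv2
    import numpy as np

    def motion_score(video_path: str, sampling_fps: float = 2.0) -> float:
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS) or sampling_fps
        stride = max(int(round(fps / sampling_fps)), 1)  # frames to skip
        prev, mags, idx = None, [], 0
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            if idx % stride == 0:
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                if prev is not None:
                    flow = cv2.calcOpticalFlowFarneback(
                        prev, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
                    mags.append(np.linalg.norm(flow, axis=-1).mean())
                prev = gray
            idx += 1
        cap.release()
        return float(np.mean(mags)) if mags else 0.0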
2 changes: 1 addition & 1 deletion configs/demo/bench/model_train.yaml
@@ -13,7 +13,7 @@ train:
# https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors
transformer_path: "/PATH/TO/EASYANIMATE_MODEL"
dataset_path:
- # The root diretory to videos. Set empty if it is the absolute path in the dataset.
+ # The root directory to videos. Set empty if it is the absolute path in the dataset.
dataset_name: ""
# path to the Data-Juicer dataset. Note that the root path is in "thirdparth/models/EasyAnimate"
dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-for-train.jsonl"
2 changes: 1 addition & 1 deletion configs/demo/bench/model_train_2_epoch.yaml
@@ -13,7 +13,7 @@ train:
# https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors
transformer_path: "/PATH/TO/EASYANIMATE_MODEL"
dataset_path:
- # The root diretory to videos. Set empty if it is the absolute path in the dataset.
+ # The root directory to videos. Set empty if it is the absolute path in the dataset.
dataset_name: ""
# path to the Data-Juicer dataset. Note that the root path is in "thirdparth/easy_animate"
dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-with-multi-op-stats.jsonl"
4 changes: 2 additions & 2 deletions configs/demo/sandbox/inception_eval_config.yaml
@@ -4,9 +4,9 @@ type: video_inception_evaluator
fake_data_path: /path/to/the/generated/dj_format_dataset
# The path to the ground-truth dataset. Only the `jsonl` format is supported. The video paths are put in a list under the `videos` key. Required when computing FVD, FID, KID, and PR.
real_data_path: /path/to/the/groundtruth/dj_format_dataset
- # The root diretory to store the generated videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
+ # The root directory to store the generated videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
fake_mm_dir: null
- # The root diretory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
+ # The root directory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
real_mm_dir: null
# Path to the corresponding detection model. Download the model from the web if it is None.
detector_path: null
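For reference, one record of the expected dj_format jsonl might look like this (a hypothetical example; the paths are treated as relative once fake_mm_dir/real_mm_dir is set):

    import json

    record = {'videos': ['clips/sample_0001.mp4']}
    print(json.dumps(record))  # one such object per line of the jsonl file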
30 changes: 20 additions & 10 deletions data_juicer/config/config.py
@@ -5,7 +5,7 @@
import tempfile
import time
from argparse import ArgumentError, Namespace
- from typing import Dict, List, Tuple, Union
+ from typing import Dict, List, Union

import yaml
from jsonargparse import (ActionConfigFile, ArgumentParser, dict_to_namespace,
@@ -194,11 +194,17 @@ def init_configs(args=None):
'own special token according to your input dataset.')
parser.add_argument(
'--suffixes',
- type=Union[str, List[str], Tuple[str]],
+ type=Union[str, List[str]],
default=[],
help='Suffixes of files that will be found and loaded. If not set, we '
'will find files of all suffixes, and select a suitable formatter '
'with the most files as default.')
+ parser.add_argument(
+ '--turbo',
+ type=bool,
+ default=False,
+ help='Enable Turbo mode to maximize processing speed. Stability '
+ 'features like fault tolerance will be disabled.')
parser.add_argument(
'--use_cache',
type=bool,
@@ -470,6 +476,8 @@ def init_setup_from_cfg(cfg):
'image_key': cfg.image_key,
'audio_key': cfg.audio_key,
'video_key': cfg.video_key,
+ 'num_proc': cfg.np,
+ 'turbo': cfg.turbo,
}
else:
if 'text_key' not in args or args['text_key'] is None:
@@ -480,6 +488,10 @@
args['audio_key'] = cfg.audio_key
if 'video_key' not in args or args['video_key'] is None:
args['video_key'] = cfg.video_key
+ if 'num_proc' not in args or args['num_proc'] is None:
+ args['num_proc'] = cfg.np
+ if 'turbo' not in args or args['turbo'] is None:
+ args['turbo'] = cfg.turbo
op[op_name] = args

return cfg
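The two hunks above apply the same fill-in rule in both config branches; condensed, it behaves like this (a sketch, not the repo's exact code):

    def fill_op_defaults(op_args: dict, np: int, turbo: bool) -> dict:
        # global num_proc/turbo apply only when the op does not set its own
        for key, value in (('num_proc', np), ('turbo', turbo)):
            if op_args.get(key) is None:
                op_args[key] = value
        return op_args

    print(fill_op_defaults({'num_proc': 8}, np=4, turbo=True))
    # {'num_proc': 8, 'turbo': True}: the per-op num_proc wins, the global turbo fills in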
@@ -574,14 +586,12 @@ def update_op_process(cfg, parser):

# update op params of cfg.process
internal_op_para = temp_cfg.get(op_in_process_name)
- if internal_op_para is not None:
- num_proc = internal_op_para.get('num_proc')
- if 'num_proc' in internal_op_para:
- internal_op_para['num_proc'] = num_proc or cfg.np
- internal_op_para = namespace_to_dict(internal_op_para)
- else:
- internal_op_para = None
- cfg.process[i] = {op_in_process_name: internal_op_para}
+ cfg.process[i] = {
+ op_in_process_name:
+ None if internal_op_para is None else
+ namespace_to_dict(internal_op_para)
+ }

# check the op params via type hint
temp_parser = copy.deepcopy(parser)
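The dropped num_proc patching is no longer needed here because init_setup_from_cfg (see the hunk above) now injects num_proc into every op's args; what remains reduces to a single expression, illustrated below with a plain dict standing in for namespace_to_dict:

    def to_process_entry(op_name: str, internal_op_para, namespace_to_dict=dict):
        return {
            op_name:
            None if internal_op_para is None else
            namespace_to_dict(internal_op_para)
        }

    print(to_process_entry('words_num_filter', None))  # {'words_num_filter': None}
    print(to_process_entry('words_num_filter', {'min_num': 10}))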
9 changes: 6 additions & 3 deletions data_juicer/core/analyzer.py
@@ -39,9 +39,12 @@ def __init__(self, cfg=None):

# setup formatter
logger.info('Setting up data formatter...')
- self.formatter = load_formatter(self.cfg.dataset_path,
- self.cfg.text_keys, self.cfg.suffixes,
- self.cfg.add_suffix)
+ self.formatter = load_formatter(
+ dataset_path=self.cfg.dataset_path,
+ generated_dataset_config=self.cfg.generated_dataset_config,
+ text_keys=self.cfg.text_keys,
+ suffixes=self.cfg.suffixes,
+ add_suffix=self.cfg.add_suffix)

# prepare exporter and check export path suffix
# NOTICE: no need to export dataset texts for analyzer
15 changes: 10 additions & 5 deletions data_juicer/core/data.py
@@ -227,12 +227,17 @@ def map(self, *args, **kargs):
called_func, '__wrapped__'):
called_func = called_func.__wrapped__

- # Batched is always required for fault tolerance
if inspect.ismethod(called_func):
- kargs['batched'] = True
- kargs['batch_size'] = kargs.pop('batch_size', 1) if hasattr(
- called_func.__self__, 'is_batched_op'
- ) and called_func.__self__.is_batched_op() else 1
+ # batched is required for fault-tolerant or batched OP
+ if not called_func.__self__.turbo or hasattr(
+ called_func.__self__,
+ 'is_batched_op') and called_func.__self__.is_batched_op():
+ kargs['batched'] = True
+ kargs['batch_size'] = kargs.pop('batch_size', 1) if hasattr(
+ called_func.__self__, 'is_batched_op'
+ ) and called_func.__self__.is_batched_op() else 1
+ else:
+ kargs['batched'] = False

if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None:
new_fingerprint = generate_fingerprint(self, *args, **kargs)
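Condensed, the new dispatch rule in map() reads as follows (a sketch of the logic above, not the actual method):

    def resolve_map_kwargs(is_batched_op: bool, turbo: bool,
                           batch_size: int = 1) -> dict:
        # batching stays on for batched OPs, and for fault tolerance when
        # turbo is off; turbo trades that safety for speed
        if is_batched_op or not turbo:
            return {'batched': True,
                    'batch_size': batch_size if is_batched_op else 1}
        return {'batched': False}

    print(resolve_map_kwargs(is_batched_op=False, turbo=False))
    # {'batched': True, 'batch_size': 1}
    print(resolve_map_kwargs(is_batched_op=False, turbo=True))
    # {'batched': False}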
10 changes: 6 additions & 4 deletions data_juicer/core/executor.py
@@ -48,10 +48,12 @@ def __init__(self, cfg=None):

# setup formatter
logger.info('Setting up data formatter...')
- self.formatter = load_formatter(self.cfg.dataset_path,
- self.cfg.generated_dataset_config,
- self.cfg.text_keys, self.cfg.suffixes,
- self.cfg.add_suffix)
+ self.formatter = load_formatter(
+ dataset_path=self.cfg.dataset_path,
+ generated_dataset_config=self.cfg.generated_dataset_config,
+ text_keys=self.cfg.text_keys,
+ suffixes=self.cfg.suffixes,
+ add_suffix=self.cfg.add_suffix)

# whether to use checkpoint mechanism. If it's true, Executor will
# check if there are existing checkpoints first and try to load the
4 changes: 2 additions & 2 deletions data_juicer/format/formatter.py
@@ -1,5 +1,5 @@
import os
- from typing import List, Tuple, Union
+ from typing import List, Union

from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
from loguru import logger
@@ -27,7 +27,7 @@ def __init__(
self,
dataset_path: str,
type: str,
- suffixes: Union[str, List[str], Tuple[str]] = None,
+ suffixes: Union[str, List[str], None] = None,
text_keys: List[str] = None,
add_suffix=False,
**kwargs,
4 changes: 2 additions & 2 deletions data_juicer/format/mixture_formatter.py
@@ -1,5 +1,5 @@
from itertools import chain, repeat
- from typing import List, Tuple, Union
+ from typing import List, Union

import numpy as np
from datasets import Dataset, concatenate_datasets
@@ -15,7 +15,7 @@ class MixtureFormatter(BaseFormatter):

def __init__(self,
dataset_path: str,
- suffixes: Union[str, List[str], Tuple[str]] = None,
+ suffixes: Union[str, List[str], None] = None,
text_keys=None,
add_suffix=False,
max_samples=None,
2 changes: 2 additions & 0 deletions data_juicer/ops/base_op.py
@@ -149,6 +149,8 @@ def __init__(self, *args, **kwargs):
if isinstance(self.mem_required, str):
self.mem_required = size_to_bytes(self.mem_required) / 1024**3

+ self.turbo = kwargs.get('turbo', False)

# nested wrappers
from data_juicer.core.data import wrap_func_with_nested_access
for name in ['process', 'compute_stats', 'compute_hash']:
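With this in place, any OP can receive the flag through its keyword arguments (a minimal sketch, assuming data_juicer is importable):

    from data_juicer.ops.base_op import Mapper

    class MyMapper(Mapper):
        """Hypothetical OP; self.turbo is populated by the base class."""

    op = MyMapper(turbo=True)
    print(op.turbo)  # True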
14 changes: 8 additions & 6 deletions data_juicer/ops/deduplicator/document_minhash_deduplicator.py
@@ -5,12 +5,14 @@
import hashlib
import struct
from collections import defaultdict
+ from typing import Optional

import numpy as np
import regex
- from jsonargparse.typing import ClosedUnitInterval, PositiveInt
from loguru import logger
+ from pydantic import Field, PositiveInt
from tqdm import tqdm
+ from typing_extensions import Annotated

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
@@ -109,12 +111,12 @@ def __init__(
tokenization: str = 'space',
window_size: PositiveInt = 5,
lowercase: bool = True,
- ignore_pattern: str = None,
+ ignore_pattern: Optional[str] = None,
num_permutations: PositiveInt = 256,
- jaccard_threshold: ClosedUnitInterval = 0.7,
- num_bands: PositiveInt = None,
- num_rows_per_band: PositiveInt = None,
- tokenizer_model: str = None,
+ jaccard_threshold: Annotated[float, Field(ge=0, le=1)] = 0.7,
+ num_bands: Optional[PositiveInt] = None,
+ num_rows_per_band: Optional[PositiveInt] = None,
+ tokenizer_model: Optional[str] = None,
*args,
**kwargs,
):
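The pydantic annotation replaces jsonargparse's ClosedUnitInterval with an equivalent [0, 1] range constraint; under pydantic v2 such a constraint can be enforced at call time, for example (a sketch of the mechanism, not code from this repo):

    from pydantic import Field, validate_call
    from typing_extensions import Annotated

    @validate_call
    def set_threshold(t: Annotated[float, Field(ge=0, le=1)] = 0.7) -> float:
        return t

    set_threshold(0.7)  # ok
    set_threshold(1.5)  # raises a pydantic ValidationError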
6 changes: 3 additions & 3 deletions data_juicer/ops/deduplicator/document_simhash_deduplicator.py
@@ -3,12 +3,12 @@
# --------------------------------------------------------

from collections import defaultdict, deque
- from typing import Dict, Set
+ from typing import Dict, Optional, Set

import numpy as np
import regex
- from jsonargparse.typing import PositiveInt
from loguru import logger
+ from pydantic import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
@@ -30,7 +30,7 @@ def __init__(self,
tokenization: str = 'space',
window_size: PositiveInt = 6,
lowercase: bool = True,
- ignore_pattern: str = None,
+ ignore_pattern: Optional[str] = None,
num_blocks: PositiveInt = 6,
hamming_distance: PositiveInt = 4,
*args,
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/image_deduplicator.py
@@ -104,7 +104,7 @@ def process(self, dataset, show_num=0):
if show_num > 0:
# sample duplicate pairs
if self.consider_text:
- hash2ids: Dict[Tuple[int], Set[int]] = defaultdict(set)
+ hash2ids: Dict[Tuple[int, int], Set[int]] = defaultdict(set)
hashes = zip(dataset[HashKeys.imagehash],
dataset[HashKeys.hash])
else:
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/ray_basic_deduplicator.py
@@ -1,6 +1,6 @@
from typing import Any

- from jsonargparse.typing import PositiveInt
+ from pydantic import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/ray_document_deduplicator.py
@@ -2,7 +2,7 @@
import string

import regex as re
- from jsonargparse.typing import PositiveInt
+ from pydantic import PositiveInt

from ..base_op import OPERATORS
from .ray_basic_deduplicator import RayBasicDeduplicator
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/ray_image_deduplicator.py
@@ -1,5 +1,5 @@
import numpy as np
- from jsonargparse.typing import PositiveInt
+ from pydantic import PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.mm_utils import load_data_with_context, load_image
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/ray_video_deduplicator.py
@@ -1,6 +1,6 @@
import hashlib

- from jsonargparse.typing import PositiveInt
+ from pydantic import PositiveInt

from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
load_video)
(diffs for the remaining changed files are not shown)
