Skip to content

Commit

Permalink
Fix (#427)
Browse files Browse the repository at this point in the history
Fix some words
  • Loading branch information
co63oc authored Sep 12, 2024
1 parent b3fb942 commit 762805b
Show file tree
Hide file tree
Showing 25 changed files with 93 additions and 93 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ process:
- video_motion_score_filter: # Keep samples with video motion scores within a specific range.
min_score: 0.25 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
sampling_fps: 2 # the samplig rate of frames_per_second to compute optical flow
sampling_fps: 2 # the sampling rate of frames_per_second to compute optical flow
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_nsfw_filter: # filter samples according to the nsfw scores of videos in them
hf_nsfw_model: Falconsai/nsfw_image_detection # Huggingface model name for nsfw classification
Expand Down
2 changes: 1 addition & 1 deletion configs/demo/bench/model_train.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ train:
# https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors
transformer_path: "/PATH/TO/EASYANIMATE_MODEL"
dataset_path:
# The root diretory to videos. Set empty if it is the absolute path in the dataset.
# The root directory to videos. Set empty if it is the absolute path in the dataset.
dataset_name: ""
# path to the Data-Juicer dataset. Note that the root path is in "thirdparty/models/EasyAnimate"
dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-for-train.jsonl"
Expand Down
2 changes: 1 addition & 1 deletion configs/demo/bench/model_train_2_epoch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ train:
# https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors
transformer_path: "/PATH/TO/EASYANIMATE_MODEL"
dataset_path:
# The root diretory to videos. Set empty if it is the absolute path in the dataset.
# The root directory to videos. Set empty if it is the absolute path in the dataset.
dataset_name: ""
# path to the Data-Juicer dataset. Note that the root path is in "thirdparty/easy_animate"
dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-with-multi-op-stats.jsonl"
Expand Down
4 changes: 2 additions & 2 deletions configs/demo/sandbox/inception_eval_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ type: video_inception_evaluator
fake_data_path: /path/to/the/generated/dj_format_dataset
# The path to ground truth dataset. Only support for `jsonl` format. The video paths are put in the list under `videos` keys. Required when computing FVD, FID, KID, and PR.
real_data_path: /path/to/the/groundtruth/dj_format_dataset
# The root diretory to store the generated videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
# The root directory to store the generated videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
fake_mm_dir: null
# The root diretory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
# The root directory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
real_mm_dir: null
# Path to the corresponding detection model. Download the model from web if it is None.
detector_path: null
Expand Down
8 changes: 4 additions & 4 deletions data_juicer/utils/compress.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ def _get_compressed_filename(self, filename: Union[Path, str]):
"""
return str(filename) + self.compressor_extension

def _get_cache_diretory(self, ds):
def _get_cache_directory(self, ds):
"""
Get dataset cache directory.
:param ds: input dataset.
Expand Down Expand Up @@ -324,7 +324,7 @@ def compress(self,
dataset should be compressed.
:param num_proc: number of processes to compress cache files.
"""
# remove cache files from the list of cahce files to be compressed
# remove cache files from the list of cache files to be compressed
prev_cache_names = [item['filename'] for item in prev_ds.cache_files]
this_cache_names = [item['filename'] for item in this_ds.cache_files] \
if this_ds else []
Expand Down Expand Up @@ -389,7 +389,7 @@ def decompress(self,
`cache-` and ends with compression format.
:param num_proc: number of processes to decompress cache files.
"""
cache_directory = self._get_cache_diretory(ds)
cache_directory = self._get_cache_directory(ds)
if cache_directory is None:
return

Expand Down Expand Up @@ -448,7 +448,7 @@ def cleanup_cache_files(self, ds):
which starts with `cache-` and ends with compression format
:param ds: input dataset.
"""
cache_directory = self._get_cache_diretory(ds)
cache_directory = self._get_cache_directory(ds)
if cache_directory is None:
return
f_names = self._get_cache_file_names(
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/utils/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class Fields(object):
# the name of the original file from which this sample was derived.
source_file = DEFAULT_PREFIX + 'source_file__'

# the name of diretory to store the produced multimodal data
# the name of directory to store the produced multimodal data
multimodal_data_output_dir = DEFAULT_PREFIX + 'produced_data__'


Expand Down
2 changes: 1 addition & 1 deletion data_juicer/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def add_hash_value(text, new_hash_value):
def copy_data(from_dir, to_dir, data_path):
"""
Copy data from from_dir/data_path to to_dir/data_path.
Return Ture if success.
Return True if success.
"""
from_path = os.path.join(from_dir, data_path)
to_path = os.path.join(to_dir, data_path)
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/utils/fingerprint_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


class Hasher:
"""Hasher that accepts python objets as inputs."""
"""Hasher that accepts python objects as inputs."""

dispatch: Dict = {}

Expand Down
4 changes: 2 additions & 2 deletions tools/mm_eval/inception_metrics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ python tools/video_metrics/calc_metrics_for_dataset.py --help

- `fake_data_path`: The path to generated dataset. Only support for `jsonl` format. The video paths are put in the list under `videos` keys.
- `real_data_path`: The path to ground truth dataset. Only support for `jsonl` format. The video paths are put in the list under `videos` keys. Required when computing FVD, FID, KID, and PR.
- `fake_mm_dir`: The root diretory to store the fake videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
- `real_mm_dir`: The root diretory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
- `fake_mm_dir`: The root directory to store the fake videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
- `real_mm_dir`: The root directory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
- `metric`: The name of metric applied, currently support `fvd2048_16f`, `fvd2048_128f`, `fvd2048_128f_subsample8f`, `kvd2048_16f`, `isv2048_ucf`, `prv2048_3n_16f`, `fid50k`, `kid50k`, `is50k`, `pr50k_3n`.
- `fvd2048_16f`: Compute Frechet Video Distance (FVD), sample 2048 times in dataset, 16 adjacent frames each time.
- `fvd2048_128f`: Compute Frechet Video Distance (FVD), sample 2048 times in dataset, 128 adjacent frames each time.
Expand Down
4 changes: 2 additions & 2 deletions tools/mm_eval/inception_metrics/calc_metrics_for_videos.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ def calc_metrics(
:param real_data_path: The path to ground truth dataset.
Only support for `jsonl` format. The video paths are put
in the list under `videos` keys. Required when computing FVD.
:param fake_mm_dir: The root diretory to store the fake videos.
:param fake_mm_dir: The root directory to store the fake videos.
If it is not none, the paths in jsonl file at fake_data_path
are relative paths on it, else are absolute path.
:param real_mm_dir: The root diretory to store the real videos.
:param real_mm_dir: The root directory to store the real videos.
If it is not none, the paths in jsonl file at real_data_path
are relative paths on it, else are absolute path.
:param metric: Metric to compute, can be one of
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def main(
keep_only_first_image: bool = True,
eoc_special_token: str = SpecialTokens.eoc,
image_special_token: str = '<image>',
sent_seperator: str = '\n',
sent_separator: str = '\n',
restore_questions: bool = False,
original_llava_ds_path: str = None,
):
Expand All @@ -95,7 +95,7 @@ def main(
this token always be "<image>". You can change it to align with your
own LLaVA-like datasets but should be careful of possible compatibility
problems that come from this change. Default: <image>.
:param sent_seperator: seperator to split different sentences. Default: \n.
:param sent_separator: separator to split different sentences. Default: \n.
:param restore_questions: need to restore human questions if only keep
caption when converting the LLaVA-like dataset to Data-Juicer-format.
If it's True, an extra argument original_llava_ds_path is required.
Expand Down Expand Up @@ -163,9 +163,9 @@ def main(
def clean_sentence(sentence, round):
sentence = sentence.strip()

# remove sentence seperator
if sentence.endswith(sent_seperator):
sentence = sentence[:-len(sent_seperator)].strip()
# remove sentence separator
if sentence.endswith(sent_separator):
sentence = sentence[:-len(sent_separator)].strip()
# remove possible eoc_special_tokens
if sentence.endswith(eoc_special_token):
sentence = sentence[:-len(eoc_special_token)].strip()
Expand All @@ -174,12 +174,12 @@ def clean_sentence(sentence, round):
if round > 0 and keep_only_first_image:
if sentence.startswith(image_special_token):
sentence = sentence[len(image_special_token):].strip()
if sentence.startswith(sent_seperator):
sentence = sentence[len(sent_seperator):].strip()
if sentence.startswith(sent_separator):
sentence = sentence[len(sent_separator):].strip()
if sentence.endswith(image_special_token):
sentence = sentence[:-len(image_special_token)].strip()
if sentence.endswith(sent_seperator):
sentence = sentence[:-len(sent_seperator)].strip()
if sentence.endswith(sent_separator):
sentence = sentence[:-len(sent_separator)].strip()
return sentence

conversations = []
Expand Down
22 changes: 11 additions & 11 deletions tools/multimodal/data_juicer_format_to_target_format/dj_to_mmc4.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def main(
target_mmc4_ds_path: str,
eoc_special_token: str = SpecialTokens.eoc,
image_special_token: str = SpecialTokens.image,
sent_seperator: str = ' ',
sent_separator: str = ' ',
keep_dj_fields: bool = False,
):
"""
Expand All @@ -121,7 +121,7 @@ def main(
this special token is not specified. So we simply use the default image
special token from our Data-Juicer. Default: <__dj__image> (from
Data-Juicer).
:param sent_seperator: seperator to split different sentences. Default: " "
:param sent_separator: separator to split different sentences. Default: " "
:param keep_dj_fields: whether to keep intermediate fields from
Data-Juicer, such as "images", "text", ... Default: False.
"""
Expand Down Expand Up @@ -201,25 +201,25 @@ def main(
sentences = []
curr_image_idx = 0
for text_idx, sent in enumerate(chunks):
# remove possible sentence seperator
if sent.endswith(sent_seperator):
sent = sent[:-len(sent_seperator)].strip()
if sent.startswith(sent_seperator):
sent = sent[len(sent_seperator):].strip()
# remove possible sentence separator
if sent.endswith(sent_separator):
sent = sent[:-len(sent_separator)].strip()
if sent.startswith(sent_separator):
sent = sent[len(sent_separator):].strip()

# remove possible image_special_token and update
# matched_text_index for corresponding image_info
found_image_num = 0
while sent.startswith(image_special_token):
sent = sent[len(image_special_token):].strip()
found_image_num += 1
if sent.startswith(sent_seperator):
sent = sent[len(sent_seperator):].strip()
if sent.startswith(sent_separator):
sent = sent[len(sent_separator):].strip()
while sent.endswith(image_special_token):
sent = sent[:-len(image_special_token)].strip()
found_image_num += 1
if sent.endswith(sent_seperator):
sent = sent[:-len(sent_seperator)].strip()
if sent.endswith(sent_separator):
sent = sent[:-len(sent_separator)].strip()
sentences.append(sent)
if found_image_num > 0:
for _ in range(found_image_num):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def main(
target_video_chatgpt_ds_path: str,
eoc_special_token: str = SpecialTokens.eoc,
video_special_token: str = SpecialTokens.video,
sent_seperator: str = ' ',
sent_separator: str = ' ',
):
"""
Convert a Data-Juicer-format dataset to a Video-ChatGPT-like dataset.
Expand All @@ -62,7 +62,7 @@ def main(
this special token is not specified. So we simply use the default video
special token from our Data-Juicer. Default: <__dj__video> (from
Data-Juicer).
:param sent_seperator: seperator to split different sentences. Default: " "
:param sent_separator: separator to split different sentences. Default: " "
"""
# ----- Constant settings. Better not to change them. -----
text_key = 'text' # default key of field to store the sample text
Expand Down Expand Up @@ -103,7 +103,7 @@ def main(
# add question and answer
text = s.pop(text_key).strip()
text = remove_dj_special_tokens(text, eoc_special_token,
sent_seperator,
sent_separator,
video_special_token)
# get the question and answer
parts = text.split(f'[[{tgt_q_key}]]:')[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def main(
audio_special_token: str = SpecialTokens.audio,
remove_eoc_at_last: bool = True,
remove_target_field_token: bool = False,
sent_seperator: str = '\n',
sent_separator: str = '\n',
):
"""
Convert a Data-Juicer-format dataset to a WavCaps-like dataset.
Expand All @@ -103,7 +103,7 @@ def main(
the end of text. Default: True.
:param remove_target_field_token: whether to remove the extra
target_field_token at text.
:param sent_seperator: seperator to split different sentences. Default: \n.
:param sent_separator: separator to split different sentences. Default: \n.
"""
# ----- Constant settings. Better not to change them. -----
from_format = '[[%s]]: ' # default handle method for the text label
Expand Down Expand Up @@ -147,7 +147,7 @@ def main(
del sample[Fields.meta]['num_captions_per_audio']

sample[Fields.meta][target_field] = sample['text'].replace(
audio_special_token + sent_seperator, '')
audio_special_token + sent_separator, '')
if remove_eoc_at_last:
sample[Fields.meta][target_field] = sample[
Fields.meta][target_field].replace(eoc_special_token, '')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def main(
target_youku_ds_path: str,
eoc_special_token: str = SpecialTokens.eoc,
video_special_token: str = SpecialTokens.video,
sent_seperator: str = ' ',
sent_separator: str = ' ',
subset_type: str = 'classification',
):
"""
Expand All @@ -84,7 +84,7 @@ def main(
this special token is not specified. So we simply use the default video
special token from our Data-Juicer. Default: <__dj__video> (from
Data-Juicer).
:param sent_seperator: seperator to split different sentences. Default: " "
:param sent_separator: separator to split different sentences. Default: " "
:param subset_type: the subset type of the input dataset. Should be one of
["pretrain", "classification", "retrieval", "captioning"]. Default:
"classification".
Expand Down Expand Up @@ -165,7 +165,7 @@ def main(
# add text, remove extra special tokens
text = s[text_key].strip()
text = remove_dj_special_tokens(text, eoc_special_token,
sent_seperator,
sent_separator,
video_special_token)
new_sample[tgt_text_key] = text

Expand Down
Loading

0 comments on commit 762805b

Please sign in to comment.