Skip to content

Commit

Permalink
Fix (#427)
Browse files Browse the repository at this point in the history
Fix some words
  • Loading branch information
co63oc authored Sep 12, 2024
1 parent b3fb942 commit 762805b
Show file tree
Hide file tree
Showing 25 changed files with 93 additions and 93 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ process:
- video_motion_score_filter: # Keep samples with video motion scores within a specific range.
min_score: 0.25 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
sampling_fps: 2 # the samplig rate of frames_per_second to compute optical flow
sampling_fps: 2 # the sampling rate of frames_per_second to compute optical flow
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_nsfw_filter: # filter samples according to the nsfw scores of videos in them
hf_nsfw_model: Falconsai/nsfw_image_detection # Huggingface model name for nsfw classification
Expand Down
2 changes: 1 addition & 1 deletion configs/demo/bench/model_train.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ train:
# https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors
transformer_path: "/PATH/TO/EASYANIMATE_MODEL"
dataset_path:
# The root diretory to videos. Set empty if it is the absolute path in the dataset.
# The root directory to videos. Set empty if it is the absolute path in the dataset.
dataset_name: ""
# path to the Data-Juicer dataset. Note that the root path is in "thirdparty/models/EasyAnimate"
dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-for-train.jsonl"
Expand Down
2 changes: 1 addition & 1 deletion configs/demo/bench/model_train_2_epoch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ train:
# https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/dj-competition/modelscope_sora/models/easyanimate_mm_16x256x256_pretrain.safetensors
transformer_path: "/PATH/TO/EASYANIMATE_MODEL"
dataset_path:
# The root diretory to videos. Set empty if it is the absolute path in the dataset.
# The root directory to videos. Set empty if it is the absolute path in the dataset.
dataset_name: ""
# path to the Data-Juicer dataset. Note that the root path is in "thirdparty/easy_animate"
dataset_meta_name: "../../../outputs/demo-bench/demo-dataset-with-multi-op-stats.jsonl"
Expand Down
4 changes: 2 additions & 2 deletions configs/demo/sandbox/inception_eval_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ type: video_inception_evaluator
fake_data_path: /path/to/the/generated/dj_format_dataset
# The path to ground truth dataset. Only support for `jsonl` format. The video paths are put in the list under `videos` keys. Required when computing FVD, FID, KID, and PR.
real_data_path: /path/to/the/groundtruth/dj_format_dataset
# The root diretory to store the generated videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
# The root directory to store the generated videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
fake_mm_dir: null
# The root diretory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
# The root directory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
real_mm_dir: null
# Path to the corresponding detection model. Download the model from web if it is None.
detector_path: null
Expand Down
8 changes: 4 additions & 4 deletions data_juicer/utils/compress.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ def _get_compressed_filename(self, filename: Union[Path, str]):
"""
return str(filename) + self.compressor_extension

def _get_cache_diretory(self, ds):
def _get_cache_directory(self, ds):
"""
Get dataset cache directory.
:param ds: input dataset.
Expand Down Expand Up @@ -324,7 +324,7 @@ def compress(self,
dataset should be compressed.
:param num_proc: number of processes to compress cache files.
"""
# remove cache files from the list of cahce files to be compressed
# remove cache files from the list of cache files to be compressed
prev_cache_names = [item['filename'] for item in prev_ds.cache_files]
this_cache_names = [item['filename'] for item in this_ds.cache_files] \
if this_ds else []
Expand Down Expand Up @@ -389,7 +389,7 @@ def decompress(self,
`cache-` and ends with compression format.
:param num_proc: number of processes to decompress cache files.
"""
cache_directory = self._get_cache_diretory(ds)
cache_directory = self._get_cache_directory(ds)
if cache_directory is None:
return

Expand Down Expand Up @@ -448,7 +448,7 @@ def cleanup_cache_files(self, ds):
which starts with `cache-` and ends with compression format
:param ds: input dataset.
"""
cache_directory = self._get_cache_diretory(ds)
cache_directory = self._get_cache_directory(ds)
if cache_directory is None:
return
f_names = self._get_cache_file_names(
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/utils/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class Fields(object):
# the name of the original file from which this sample was derived.
source_file = DEFAULT_PREFIX + 'source_file__'

# the name of diretory to store the produced multimodal data
# the name of directory to store the produced multimodal data
multimodal_data_output_dir = DEFAULT_PREFIX + 'produced_data__'


Expand Down
2 changes: 1 addition & 1 deletion data_juicer/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def add_hash_value(text, new_hash_value):
def copy_data(from_dir, to_dir, data_path):
"""
Copy data from from_dir/data_path to to_dir/data_path.
Return Ture if success.
Return True if success.
"""
from_path = os.path.join(from_dir, data_path)
to_path = os.path.join(to_dir, data_path)
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/utils/fingerprint_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@


class Hasher:
"""Hasher that accepts python objets as inputs."""
"""Hasher that accepts python objects as inputs."""

dispatch: Dict = {}

Expand Down
4 changes: 2 additions & 2 deletions tools/mm_eval/inception_metrics/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ python tools/video_metrics/calc_metrics_for_dataset.py --help

- `fake_data_path`: The path to generated dataset. Only support for `jsonl` format. The video paths are put in the list under `videos` keys.
- `real_data_path`: The path to ground truth dataset. Only support for `jsonl` format. The video paths are put in the list under `videos` keys. Required when computing FVD, FID, KID, and PR.
- `fake_mm_dir`: The root diretory to store the fake videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
- `real_mm_dir`: The root diretory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
- `fake_mm_dir`: The root directory to store the fake videos. If it is not none, the paths in jsonl file at fake_data_path are relative paths on it, else are absolute path.
- `real_mm_dir`: The root directory to store the real videos. If it is not none, the paths in jsonl file at real_data_path are relative paths on it, else are absolute path.
- `metric`: The name of metric applied, currently support `fvd2048_16f`, `fvd2048_128f`, `fvd2048_128f_subsample8f`, `kvd2048_16f`, `isv2048_ucf`, `prv2048_3n_16f`, `fid50k`, `kid50k`, `is50k`, `pr50k_3n`.
- `fvd2048_16f`: Compute Frechet Video Distance (FVD), sample 2048 times in dataset, 16 adjacent frames each time.
- `fvd2048_128f`: Compute Frechet Video Distance (FVD), sample 2048 times in dataset, 128 adjacent frames each time.
Expand Down
4 changes: 2 additions & 2 deletions tools/mm_eval/inception_metrics/calc_metrics_for_videos.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ def calc_metrics(
:param real_data_path: The path to ground truth dataset.
Only support for `jsonl` format. The video paths are put
in the list under `videos` keys. Required when computing FVD.
:param fake_mm_dir: The root diretory to store the fake videos.
:param fake_mm_dir: The root directory to store the fake videos.
If it is not none, the paths in jsonl file at fake_data_path
are relative paths on it, else are absolute path.
:param real_mm_dir: The root diretory to store the real videos.
:param real_mm_dir: The root directory to store the real videos.
If it is not none, the paths in jsonl file at real_data_path
are relative paths on it, else are absolute path.
:param metric: Metric to compute, can be one of
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def main(
keep_only_first_image: bool = True,
eoc_special_token: str = SpecialTokens.eoc,
image_special_token: str = '<image>',
sent_seperator: str = '\n',
sent_separator: str = '\n',
restore_questions: bool = False,
original_llava_ds_path: str = None,
):
Expand All @@ -95,7 +95,7 @@ def main(
this token always be "<image>". You can change it to align with your
own LLaVA-like datasets but should be careful of possible compatibility
problems that come from this change. Default: <image>.
:param sent_seperator: seperator to split different sentences. Default: \n.
:param sent_separator: separator to split different sentences. Default: \n.
:param restore_questions: need to restore human questions if only keep
caption when converting the LLaVA-like dataset to Data-Juicer-format.
If it's True, an extra argument original_llava_ds_path is required.
Expand Down Expand Up @@ -163,9 +163,9 @@ def main(
def clean_sentence(sentence, round):
sentence = sentence.strip()

# remove sentence seperator
if sentence.endswith(sent_seperator):
sentence = sentence[:-len(sent_seperator)].strip()
# remove sentence separator
if sentence.endswith(sent_separator):
sentence = sentence[:-len(sent_separator)].strip()
# remove possible eoc_special_tokens
if sentence.endswith(eoc_special_token):
sentence = sentence[:-len(eoc_special_token)].strip()
Expand All @@ -174,12 +174,12 @@ def clean_sentence(sentence, round):
if round > 0 and keep_only_first_image:
if sentence.startswith(image_special_token):
sentence = sentence[len(image_special_token):].strip()
if sentence.startswith(sent_seperator):
sentence = sentence[len(sent_seperator):].strip()
if sentence.startswith(sent_separator):
sentence = sentence[len(sent_separator):].strip()
if sentence.endswith(image_special_token):
sentence = sentence[:-len(image_special_token)].strip()
if sentence.endswith(sent_seperator):
sentence = sentence[:-len(sent_seperator)].strip()
if sentence.endswith(sent_separator):
sentence = sentence[:-len(sent_separator)].strip()
return sentence

conversations = []
Expand Down
22 changes: 11 additions & 11 deletions tools/multimodal/data_juicer_format_to_target_format/dj_to_mmc4.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def main(
target_mmc4_ds_path: str,
eoc_special_token: str = SpecialTokens.eoc,
image_special_token: str = SpecialTokens.image,
sent_seperator: str = ' ',
sent_separator: str = ' ',
keep_dj_fields: bool = False,
):
"""
Expand All @@ -121,7 +121,7 @@ def main(
this special token is not specified. So we simply use the default image
special token from our Data-Juicer. Default: <__dj__image> (from
Data-Juicer).
:param sent_seperator: seperator to split different sentences. Default: " "
:param sent_separator: separator to split different sentences. Default: " "
:param keep_dj_fields: whether to keep intermediate fields from
Data-Juicer, such as "images", "text", ... Default: False.
"""
Expand Down Expand Up @@ -201,25 +201,25 @@ def main(
sentences = []
curr_image_idx = 0
for text_idx, sent in enumerate(chunks):
# remove possible sentence seperator
if sent.endswith(sent_seperator):
sent = sent[:-len(sent_seperator)].strip()
if sent.startswith(sent_seperator):
sent = sent[len(sent_seperator):].strip()
# remove possible sentence separator
if sent.endswith(sent_separator):
sent = sent[:-len(sent_separator)].strip()
if sent.startswith(sent_separator):
sent = sent[len(sent_separator):].strip()

# remove possible image_special_token and update
# matched_text_index for corresponding image_info
found_image_num = 0
while sent.startswith(image_special_token):
sent = sent[len(image_special_token):].strip()
found_image_num += 1
if sent.startswith(sent_seperator):
sent = sent[len(sent_seperator):].strip()
if sent.startswith(sent_separator):
sent = sent[len(sent_separator):].strip()
while sent.endswith(image_special_token):
sent = sent[:-len(image_special_token)].strip()
found_image_num += 1
if sent.endswith(sent_seperator):
sent = sent[:-len(sent_seperator)].strip()
if sent.endswith(sent_separator):
sent = sent[:-len(sent_separator)].strip()
sentences.append(sent)
if found_image_num > 0:
for _ in range(found_image_num):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def main(
target_video_chatgpt_ds_path: str,
eoc_special_token: str = SpecialTokens.eoc,
video_special_token: str = SpecialTokens.video,
sent_seperator: str = ' ',
sent_separator: str = ' ',
):
"""
Convert a Data-Juicer-format dataset to a Video-ChatGPT-like dataset.
Expand All @@ -62,7 +62,7 @@ def main(
this special token is not specified. So we simply use the default video
special token from our Data-Juicer. Default: <__dj__video> (from
Data-Juicer).
:param sent_seperator: seperator to split different sentences. Default: " "
:param sent_separator: separator to split different sentences. Default: " "
"""
# ----- Constant settings. Better not to change them. -----
text_key = 'text' # default key of field to store the sample text
Expand Down Expand Up @@ -103,7 +103,7 @@ def main(
# add question and answer
text = s.pop(text_key).strip()
text = remove_dj_special_tokens(text, eoc_special_token,
sent_seperator,
sent_separator,
video_special_token)
# get the question and answer
parts = text.split(f'[[{tgt_q_key}]]:')[1]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def main(
audio_special_token: str = SpecialTokens.audio,
remove_eoc_at_last: bool = True,
remove_target_field_token: bool = False,
sent_seperator: str = '\n',
sent_separator: str = '\n',
):
"""
Convert a Data-Juicer-format dataset to a WavCaps-like dataset.
Expand All @@ -103,7 +103,7 @@ def main(
the end of text. Default: True.
:param remove_target_field_token: whether to remove the extra
target_field_token at text.
:param sent_seperator: seperator to split different sentences. Default: \n.
:param sent_separator: separator to split different sentences. Default: \n.
"""
# ----- Constant settings. Better not to change them. -----
from_format = '[[%s]]: ' # default handle method for the text label
Expand Down Expand Up @@ -147,7 +147,7 @@ def main(
del sample[Fields.meta]['num_captions_per_audio']

sample[Fields.meta][target_field] = sample['text'].replace(
audio_special_token + sent_seperator, '')
audio_special_token + sent_separator, '')
if remove_eoc_at_last:
sample[Fields.meta][target_field] = sample[
Fields.meta][target_field].replace(eoc_special_token, '')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def main(
target_youku_ds_path: str,
eoc_special_token: str = SpecialTokens.eoc,
video_special_token: str = SpecialTokens.video,
sent_seperator: str = ' ',
sent_separator: str = ' ',
subset_type: str = 'classification',
):
"""
Expand All @@ -84,7 +84,7 @@ def main(
this special token is not specified. So we simply use the default video
special token from our Data-Juicer. Default: <__dj__video> (from
Data-Juicer).
:param sent_seperator: seperator to split different sentences. Default: " "
:param sent_separator: separator to split different sentences. Default: " "
:param subset_type: the subset type of the input dataset. Should be one of
["pretrain", "classification", "retrieval", "captioning"]. Default:
"classification".
Expand Down Expand Up @@ -165,7 +165,7 @@ def main(
# add text, remove extra special tokens
text = s[text_key].strip()
text = remove_dj_special_tokens(text, eoc_special_token,
sent_seperator,
sent_separator,
video_special_token)
new_sample[tgt_text_key] = text

Expand Down
Loading

0 comments on commit 762805b

Please sign in to comment.