Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

* fix pre-commit failures and add pre-commit action #68

Merged
merged 7 commits into from
Nov 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

name: pre-commit

on: [push, pull_request]

jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: '3.8'
- uses: pre-commit/[email protected]
10 changes: 8 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ repos:
hooks:
- id: flake8
- repo: https://github.com/PyCQA/isort.git
rev: 4.3.21
rev: 5.12.0
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-yapf
Expand Down Expand Up @@ -34,4 +34,10 @@ repos:
exclude: thirdparty/
args: [ "--fix=lf" ]

exclude: 'docs/.*'
exclude: |
(?x)^(
docs/.*|
tests/.*|
demos/.*|
.*\.md
)$
15 changes: 8 additions & 7 deletions data_juicer/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,15 @@ def init_configs(args=None):
type=str,
default=SpecialTokens.image,
help='The special token that represents an image in the text. In '
'default, it\'s "<__dj__image>". You can specify your own special'
' token according to your input dataset.')
'default, it\'s "<__dj__image>". You can specify your own special'
' token according to your input dataset.')
parser.add_argument(
'--eoc_special_token',
type=str,
default=SpecialTokens.eoc,
help='The special token that represents the end of a chunk in the '
'text. In default, it\'s "<|__dj__eoc|>". You can specify your '
'own special token according to your input dataset.')
'text. In default, it\'s "<|__dj__eoc|>". You can specify your '
'own special token according to your input dataset.')
parser.add_argument(
'--suffixes',
type=Union[str, List[str], Tuple[str]],
Expand Down Expand Up @@ -314,8 +314,8 @@ def init_setup_from_cfg(cfg):
if os.path.isdir(cfg.dataset_path):
cfg.dataset_dir = os.path.abspath(cfg.dataset_path)
else:
cfg.dataset_dir = os.path.abspath(
os.path.dirname(cfg.dataset_path))
cfg.dataset_dir = os.path.abspath(os.path.dirname(
cfg.dataset_path))
else:
logger.error(f'Input dataset_path [{cfg.dataset_path}] is invalid. '
f'Please check and retry.')
Expand Down Expand Up @@ -445,8 +445,9 @@ def config_backup(cfg):


def display_config(cfg):
from tabulate import tabulate
import pprint

from tabulate import tabulate
table_header = ['key', 'values']

# remove ops outside the process list for better displaying
Expand Down
5 changes: 4 additions & 1 deletion data_juicer/core/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,10 @@ def to_json(dataset, export_path, num_proc=1, **kwargs):
:param kwargs: extra arguments.
:return:
"""
dataset.to_json(export_path, force_ascii=False, num_proc=num_proc, lines=False)
dataset.to_json(export_path,
force_ascii=False,
num_proc=num_proc,
lines=False)

@staticmethod
def to_parquet(dataset, export_path, **kwargs):
Expand Down
4 changes: 2 additions & 2 deletions data_juicer/core/ray_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ def run(self, load_data_np=None):
dataset = dataset.filter(op.process)
else:
logger.error(
'Ray executor only support Filter and Mapper OPs for now'
)
'Ray executor only support Filter and Mapper OPs for '
'now')
raise NotImplementedError
except: # noqa: E722
logger.error(f'An error occurred during Op [{op_name}].')
Expand Down
22 changes: 10 additions & 12 deletions data_juicer/format/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,7 @@ def __init__(
self.data_files = find_files_with_suffix(dataset_path, suffixes)
self.add_suffix = add_suffix

def load_dataset(self,
num_proc: int = 1,
global_cfg=None) -> Dataset:
def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from dataset file or dataset directory, and unify its
format.
Expand Down Expand Up @@ -103,9 +101,7 @@ def __init__(self,
self.text_keys = text_keys
self.kwargs = kwargs

def load_dataset(self,
num_proc: int = 1,
global_cfg=None) -> Dataset:
def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from HuggingFace, and unify its format.

Expand Down Expand Up @@ -226,8 +222,10 @@ def rel2abs(sample, path_keys, dataset_dir):
paths = sample[path_key]
if not paths:
continue
new_paths = [os.path.join(dataset_dir, path)
for path in paths if not os.path.isabs(path)]
new_paths = [
os.path.join(dataset_dir, path) for path in paths
if not os.path.isabs(path)
]
sample[path_key] = new_paths
return sample

Expand All @@ -240,10 +238,10 @@ def rel2abs(sample, path_keys, dataset_dir):
'dataset_dir': ds_dir
})
else:
logger.warning(f'No global config passed into unify_format function. '
f'Relative paths in the dataset might not be converted '
f'to their absolute versions. Data of other modalities '
f'might not be able to find by Data-Juicer.')
logger.warning('No global config passed into unify_format function. '
'Relative paths in the dataset might not be converted '
'to their absolute versions. Data of other modalities '
'might not be able to find by Data-Juicer.')

return dataset

Expand Down
4 changes: 1 addition & 3 deletions data_juicer/format/text_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,7 @@ def __init__(self,
self.dataset_path = dataset_path
self.add_suffix = add_suffix

def load_dataset(self,
num_proc: int = 1,
global_cfg=None) -> Dataset:
def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset:
"""
Load a dataset from local text-type files.

Expand Down
48 changes: 28 additions & 20 deletions data_juicer/ops/base_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@

OPERATORS = Registry('Operators')


class OP:
def __init__(self,
text_key: str = None,
image_key: str = None,
):

def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class of operators.

Expand All @@ -29,12 +32,14 @@ def __init__(self,
def process(self, *args, **kwargs):
raise NotImplementedError


class Mapper(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that conducts data editing.

Expand Down Expand Up @@ -63,10 +68,11 @@ def is_batched_op(self):

class Filter(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that removes specific info.

Expand Down Expand Up @@ -104,10 +110,11 @@ def process(self, sample):

class Deduplicator(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that conducts deduplication.

Expand Down Expand Up @@ -144,10 +151,11 @@ def process(self, dataset, show_num=0):

class Selector(OP):

def __init__(self,
text_key: str = None,
image_key: str = None,
):
def __init__(
self,
text_key: str = None,
image_key: str = None,
):
"""
Base class that conducts selection in dataset-level.

Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/filter/character_repetition_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
@OPERATORS.register_module('character_repetition_filter')
class CharacterRepetitionFilter(Filter):
"""Filter to keep samples with char-level n-gram repetition ratio within a
\ specific range."""
specific range."""

def __init__(self,
rep_len: PositiveInt = 10,
Expand Down
8 changes: 3 additions & 5 deletions data_juicer/ops/filter/image_aspect_ratio_filter.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@

import numpy as np

from jsonargparse.typing import PositiveFloat

from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import load_image

from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_IMAGES
from data_juicer.utils.mm_utils import load_image


@OPERATORS.register_module('image_aspect_ratio_filter')
Expand Down Expand Up @@ -85,7 +83,8 @@ def process(self, sample):
aspect_ratios = sample[Fields.stats][StatsKeys.aspect_ratios]
keep_bools = np.array([
self.min_ratio <= aspect_ratio <= self.max_ratio
for aspect_ratio in aspect_ratios])
for aspect_ratio in aspect_ratios
])
if len(keep_bools) <= 0:
return True

Expand All @@ -94,4 +93,3 @@ def process(self, sample):
return keep_bools.any()
else:
return keep_bools.all()

3 changes: 2 additions & 1 deletion data_juicer/ops/filter/language_id_score_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def compute_stats(self, sample):
def process(self, sample):
if self.lang:
return sample[Fields.stats][StatsKeys.lang] == self.lang \
and sample[Fields.stats][StatsKeys.lang_score] >= self.min_score
and sample[Fields.stats][StatsKeys.lang_score] >= \
self.min_score
else:
return sample[Fields.stats][StatsKeys.lang_score] >= self.min_score
3 changes: 2 additions & 1 deletion data_juicer/ops/filter/special_characters_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ def compute_stats(self, sample):
return sample

def process(self, sample):
if self.min_ratio <= sample[Fields.stats][StatsKeys.special_char_ratio] \
if self.min_ratio <= \
sample[Fields.stats][StatsKeys.special_char_ratio] \
<= self.max_ratio:
return True
else:
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/filter/word_repetition_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
@INTER_WORDS.register_module('word_repetition_filter')
class WordRepetitionFilter(Filter):
"""Filter to keep samples with word-level n-gram repetition ratio within a
\ specific range."""
specific range."""

def __init__(self,
lang: str = 'en',
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/punctuation_normalization_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
@OPERATORS.register_module('punctuation_normalization_mapper')
class PunctuationNormalizationMapper(Mapper):
"""Mapper to normalize unicode punctuations to English punctuations in text
\ samples."""
samples."""

def __init__(self, *args, **kwargs):
"""
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/remove_comments_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class RemoveCommentsMapper(Mapper):
"""
Mapper to remove comments in different kinds of documents.

Only support 'tex' \ for now.
Only support 'tex' for now.
"""

def __init__(self,
Expand Down
4 changes: 3 additions & 1 deletion data_juicer/utils/mm_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@

from datasets import Image

from data_juicer.utils.constant import DEFAULT_PREFIX


# A class to keep special tokens for multimodal information in the texts
# The tokens in this class can be updated by corresponding arguments in config
class SpecialTokens(object):
Expand All @@ -12,9 +12,11 @@ class SpecialTokens(object):
# others
eoc = f'<|{DEFAULT_PREFIX}eoc|>'


def load_images(paths):
return [load_image(path) for path in paths]


def load_image(path):
img_feature = Image()
img = img_feature.decode_example(img_feature.encode_example(path))
Expand Down
6 changes: 6 additions & 0 deletions docs/DeveloperGuide.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ pre-commit run --all-files
git commit -m "xxxx"
```

**Note**: We have configured pre-commit checks in the GitHub workflow. If this
check fails in your PR, please do the following locally: ① ensure that the
pre-commit dependencies are consistent with the project configuration
(which can be done via `pre-commit clean` and `pre-commit install`);
and ② run `pre-commit run --all-files` before pushing.

## Build your own ops

- Data-Juicer allows everybody to build their own ops.
Expand Down
2 changes: 2 additions & 0 deletions docs/DeveloperGuide_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ pre-commit run --all-files
git commit -m "<your_commit_message>"
```

**注意**:我们在github workflow配置了pre-commit的检查。如果您的PR中该检查没通过,请在本地①确保pre-commit 的相关依赖与项目配置一致(可通过`pre-commit clean`和`pre-commit install`完成);②push前执行了`pre-commit run --all-files`.

## 构建自己的算子

- Data-Juicer 支持每个人定义自己的算子。
Expand Down
Loading