diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 000000000..b88d2ec96 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,14 @@ + +name: pre-commit + +on: [push, pull_request] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: '3.8' + - uses: pre-commit/action@v3.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 558eb7bb2..cb4b56d42 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: flake8 - repo: https://github.com/PyCQA/isort.git - rev: 4.3.21 + rev: 5.12.0 hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-yapf @@ -34,4 +34,10 @@ repos: exclude: thirdparty/ args: [ "--fix=lf" ] -exclude: 'docs/.*' +exclude: | + (?x)^( + docs/.*| + tests/.*| + demos/.*| + .*\.md + )$ diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py index f59d8cdc9..c6eef4329 100644 --- a/data_juicer/config/config.py +++ b/data_juicer/config/config.py @@ -113,15 +113,15 @@ def init_configs(args=None): type=str, default=SpecialTokens.image, help='The special token that represents an image in the text. In ' - 'default, it\'s "<__dj__image>". You can specify your own special' - ' token according to your input dataset.') + 'default, it\'s "<__dj__image>". You can specify your own special' + ' token according to your input dataset.') parser.add_argument( '--eoc_special_token', type=str, default=SpecialTokens.eoc, help='The special token that represents the end of a chunk in the ' - 'text. In default, it\'s "<|__dj__eoc|>". You can specify your ' - 'own special token according to your input dataset.') + 'text. In default, it\'s "<|__dj__eoc|>". You can specify your ' + 'own special token according to your input dataset.') parser.add_argument( '--suffixes', type=Union[str, List[str], Tuple[str]], @@ -314,8 +314,8 @@ def init_setup_from_cfg(cfg): if os.path.isdir(cfg.dataset_path): cfg.dataset_dir = os.path.abspath(cfg.dataset_path) else: - cfg.dataset_dir = os.path.abspath( - os.path.dirname(cfg.dataset_path)) + cfg.dataset_dir = os.path.abspath(os.path.dirname( + cfg.dataset_path)) else: logger.error(f'Input dataset_path [{cfg.dataset_path}] is invalid. ' f'Please check and retry.') @@ -445,8 +445,9 @@ def config_backup(cfg): def display_config(cfg): - from tabulate import tabulate import pprint + + from tabulate import tabulate table_header = ['key', 'values'] # remove ops outside the process list for better displaying diff --git a/data_juicer/core/exporter.py b/data_juicer/core/exporter.py index 1ca6f19bf..7377e5225 100644 --- a/data_juicer/core/exporter.py +++ b/data_juicer/core/exporter.py @@ -197,7 +197,10 @@ def to_json(dataset, export_path, num_proc=1, **kwargs): :param kwargs: extra arguments. 
:return: """ - dataset.to_json(export_path, force_ascii=False, num_proc=num_proc, lines=False) + dataset.to_json(export_path, + force_ascii=False, + num_proc=num_proc, + lines=False) @staticmethod def to_parquet(dataset, export_path, **kwargs): diff --git a/data_juicer/core/ray_executor.py b/data_juicer/core/ray_executor.py index d1290c206..513e16151 100644 --- a/data_juicer/core/ray_executor.py +++ b/data_juicer/core/ray_executor.py @@ -67,8 +67,8 @@ def run(self, load_data_np=None): dataset = dataset.filter(op.process) else: logger.error( - 'Ray executor only support Filter and Mapper OPs for now' - ) + 'Ray executor only support Filter and Mapper OPs for ' + 'now') raise NotImplementedError except: # noqa: E722 logger.error(f'An error occurred during Op [{op_name}].') diff --git a/data_juicer/format/formatter.py b/data_juicer/format/formatter.py index a297463b7..86ea481db 100644 --- a/data_juicer/format/formatter.py +++ b/data_juicer/format/formatter.py @@ -51,9 +51,7 @@ def __init__( self.data_files = find_files_with_suffix(dataset_path, suffixes) self.add_suffix = add_suffix - def load_dataset(self, - num_proc: int = 1, - global_cfg=None) -> Dataset: + def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: """ Load a dataset from dataset file or dataset directory, and unify its format. @@ -103,9 +101,7 @@ def __init__(self, self.text_keys = text_keys self.kwargs = kwargs - def load_dataset(self, - num_proc: int = 1, - global_cfg=None) -> Dataset: + def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: """ Load a dataset from HuggingFace, and unify its format. @@ -226,8 +222,10 @@ def rel2abs(sample, path_keys, dataset_dir): paths = sample[path_key] if not paths: continue - new_paths = [os.path.join(dataset_dir, path) - for path in paths if not os.path.isabs(path)] + new_paths = [ + os.path.join(dataset_dir, path) for path in paths + if not os.path.isabs(path) + ] sample[path_key] = new_paths return sample @@ -240,10 +238,10 @@ def rel2abs(sample, path_keys, dataset_dir): 'dataset_dir': ds_dir }) else: - logger.warning(f'No global config passed into unify_format function. ' - f'Relative paths in the dataset might not be converted ' - f'to their absolute versions. Data of other modalities ' - f'might not be able to find by Data-Juicer.') + logger.warning('No global config passed into unify_format function. ' + 'Relative paths in the dataset might not be converted ' + 'to their absolute versions. Data of other modalities ' + 'might not be able to find by Data-Juicer.') return dataset diff --git a/data_juicer/format/text_formatter.py b/data_juicer/format/text_formatter.py index fdad34fac..0f34930c7 100644 --- a/data_juicer/format/text_formatter.py +++ b/data_juicer/format/text_formatter.py @@ -96,9 +96,7 @@ def __init__(self, self.dataset_path = dataset_path self.add_suffix = add_suffix - def load_dataset(self, - num_proc: int = 1, - global_cfg=None) -> Dataset: + def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: """ Load a dataset from local text-type files. diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index be45a9673..4eabfdca5 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -2,11 +2,14 @@ OPERATORS = Registry('Operators') + class OP: - def __init__(self, - text_key: str = None, - image_key: str = None, - ): + + def __init__( + self, + text_key: str = None, + image_key: str = None, + ): """ Base class of operators. 
@@ -29,12 +32,14 @@ def __init__(self, def process(self, *args, **kwargs): raise NotImplementedError + class Mapper(OP): - def __init__(self, - text_key: str = None, - image_key: str = None, - ): + def __init__( + self, + text_key: str = None, + image_key: str = None, + ): """ Base class that conducts data editing. @@ -63,10 +68,11 @@ def is_batched_op(self): class Filter(OP): - def __init__(self, - text_key: str = None, - image_key: str = None, - ): + def __init__( + self, + text_key: str = None, + image_key: str = None, + ): """ Base class that removes specific info. @@ -104,10 +110,11 @@ def process(self, sample): class Deduplicator(OP): - def __init__(self, - text_key: str = None, - image_key: str = None, - ): + def __init__( + self, + text_key: str = None, + image_key: str = None, + ): """ Base class that conducts deduplication. @@ -144,10 +151,11 @@ def process(self, dataset, show_num=0): class Selector(OP): - def __init__(self, - text_key: str = None, - image_key: str = None, - ): + def __init__( + self, + text_key: str = None, + image_key: str = None, + ): """ Base class that conducts selection in dataset-level. diff --git a/data_juicer/ops/filter/character_repetition_filter.py b/data_juicer/ops/filter/character_repetition_filter.py index 3e5ce7e34..e67423030 100644 --- a/data_juicer/ops/filter/character_repetition_filter.py +++ b/data_juicer/ops/filter/character_repetition_filter.py @@ -13,7 +13,7 @@ @OPERATORS.register_module('character_repetition_filter') class CharacterRepetitionFilter(Filter): """Filter to keep samples with char-level n-gram repetition ratio within a - \ specific range.""" + specific range.""" def __init__(self, rep_len: PositiveInt = 10, diff --git a/data_juicer/ops/filter/image_aspect_ratio_filter.py b/data_juicer/ops/filter/image_aspect_ratio_filter.py index 0af4ec214..fe205e5c6 100644 --- a/data_juicer/ops/filter/image_aspect_ratio_filter.py +++ b/data_juicer/ops/filter/image_aspect_ratio_filter.py @@ -1,13 +1,11 @@ - import numpy as np - from jsonargparse.typing import PositiveFloat from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.mm_utils import load_image from ..base_op import OPERATORS, Filter from ..op_fusion import LOADED_IMAGES -from data_juicer.utils.mm_utils import load_image @OPERATORS.register_module('image_aspect_ratio_filter') @@ -85,7 +83,8 @@ def process(self, sample): aspect_ratios = sample[Fields.stats][StatsKeys.aspect_ratios] keep_bools = np.array([ self.min_ratio <= aspect_ratio <= self.max_ratio - for aspect_ratio in aspect_ratios]) + for aspect_ratio in aspect_ratios + ]) if len(keep_bools) <= 0: return True @@ -94,4 +93,3 @@ def process(self, sample): return keep_bools.any() else: return keep_bools.all() - diff --git a/data_juicer/ops/filter/language_id_score_filter.py b/data_juicer/ops/filter/language_id_score_filter.py index cc3520be5..800f635cf 100644 --- a/data_juicer/ops/filter/language_id_score_filter.py +++ b/data_juicer/ops/filter/language_id_score_filter.py @@ -57,6 +57,7 @@ def compute_stats(self, sample): def process(self, sample): if self.lang: return sample[Fields.stats][StatsKeys.lang] == self.lang \ - and sample[Fields.stats][StatsKeys.lang_score] >= self.min_score + and sample[Fields.stats][StatsKeys.lang_score] >= \ + self.min_score else: return sample[Fields.stats][StatsKeys.lang_score] >= self.min_score diff --git a/data_juicer/ops/filter/special_characters_filter.py b/data_juicer/ops/filter/special_characters_filter.py index 7b6cad2a8..3b1b1a893 100644 --- 
a/data_juicer/ops/filter/special_characters_filter.py +++ b/data_juicer/ops/filter/special_characters_filter.py @@ -50,7 +50,8 @@ def compute_stats(self, sample): return sample def process(self, sample): - if self.min_ratio <= sample[Fields.stats][StatsKeys.special_char_ratio] \ + if self.min_ratio <= \ + sample[Fields.stats][StatsKeys.special_char_ratio] \ <= self.max_ratio: return True else: diff --git a/data_juicer/ops/filter/word_repetition_filter.py b/data_juicer/ops/filter/word_repetition_filter.py index 1d8d232ce..3883541e8 100644 --- a/data_juicer/ops/filter/word_repetition_filter.py +++ b/data_juicer/ops/filter/word_repetition_filter.py @@ -17,7 +17,7 @@ @INTER_WORDS.register_module('word_repetition_filter') class WordRepetitionFilter(Filter): """Filter to keep samples with word-level n-gram repetition ratio within a - \ specific range.""" + specific range.""" def __init__(self, lang: str = 'en', diff --git a/data_juicer/ops/mapper/punctuation_normalization_mapper.py b/data_juicer/ops/mapper/punctuation_normalization_mapper.py index e8cdf3e60..b6640e9eb 100644 --- a/data_juicer/ops/mapper/punctuation_normalization_mapper.py +++ b/data_juicer/ops/mapper/punctuation_normalization_mapper.py @@ -8,7 +8,7 @@ @OPERATORS.register_module('punctuation_normalization_mapper') class PunctuationNormalizationMapper(Mapper): """Mapper to normalize unicode punctuations to English punctuations in text - \ samples.""" + samples.""" def __init__(self, *args, **kwargs): """ diff --git a/data_juicer/ops/mapper/remove_comments_mapper.py b/data_juicer/ops/mapper/remove_comments_mapper.py index f49bc9065..b3533dd2b 100644 --- a/data_juicer/ops/mapper/remove_comments_mapper.py +++ b/data_juicer/ops/mapper/remove_comments_mapper.py @@ -14,7 +14,7 @@ class RemoveCommentsMapper(Mapper): """ Mapper to remove comments in different kinds of documents. - Only support 'tex' \ for now. + Only support 'tex' for now. """ def __init__(self, diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py index b67484062..6ed0931e0 100644 --- a/data_juicer/utils/mm_utils.py +++ b/data_juicer/utils/mm_utils.py @@ -1,8 +1,8 @@ - from datasets import Image from data_juicer.utils.constant import DEFAULT_PREFIX + # A class to keep special tokens for multimodal information in the texts # The tokens in this class can be updated by corresponding arguments in config class SpecialTokens(object): @@ -12,9 +12,11 @@ class SpecialTokens(object): # others eoc = f'<|{DEFAULT_PREFIX}eoc|>' + def load_images(paths): return [load_image(path) for path in paths] + def load_image(path): img_feature = Image() img = img_feature.decode_example(img_feature.encode_example(path)) diff --git a/docs/DeveloperGuide.md b/docs/DeveloperGuide.md index bcaa3450a..2051f26b7 100644 --- a/docs/DeveloperGuide.md +++ b/docs/DeveloperGuide.md @@ -29,6 +29,12 @@ pre-commit run --all-files git commit -m "xxxx" ``` +**Note**: We have configured pre-commit checks in the GitHub workflow. If this +check fails for your PR, please ① make sure your local pre-commit dependencies +are consistent with the project configuration (you can refresh them with +`pre-commit clean` and `pre-commit install`); and ② run +`pre-commit run --all-files` locally before pushing. + ## Build your own ops - Data-Juicer allows everybody to build their own ops. 
diff --git a/docs/DeveloperGuide_ZH.md b/docs/DeveloperGuide_ZH.md index eb5f8e2a6..b3349067b 100644 --- a/docs/DeveloperGuide_ZH.md +++ b/docs/DeveloperGuide_ZH.md @@ -28,6 +28,8 @@ pre-commit run --all-files git commit -m "" ``` +**注意**:我们在github workflow配置了pre-commit的检查。如果您的PR中该检查没通过,请在本地①确保pre-commit 的相关依赖与项目配置一致(可通过`pre-commit clean`和`pre-commit install`完成);②push前执行了`pre-commit run --all-files`. + ## 构建自己的算子 - Data-Juicer 支持每个人定义自己的算子。 diff --git a/tools/converter/convert_gpt_to_transformers.py b/tools/converter/convert_gpt_to_transformers.py index 751866181..612ada6fc 100644 --- a/tools/converter/convert_gpt_to_transformers.py +++ b/tools/converter/convert_gpt_to_transformers.py @@ -28,16 +28,13 @@ import json import os import re -import sys -import types import torch -from transformers import AutoTokenizer, LlamaConfig +from modeling_megatron_llama import MegatronLlamaConfig +from transformers import AutoTokenizer from transformers.modeling_utils import (WEIGHTS_INDEX_NAME, WEIGHTS_NAME, shard_checkpoint) -from modeling_megatron_llama import MegatronLlamaConfig - def add_checkpointing_args(parser): parser.add_argument('--megatron-path', @@ -65,21 +62,21 @@ def add_transformers_checkpoint_args(parser): '--tokenizer_name', type=str, default=None, - help= - ('The name of the pre-trained tokenizer to save. ' - 'If not None, the tokenizer will be saved. ' - 'Only used when converting a Megatron checkpoint to a Transformers checkpoint.' - ), + help=( + 'The name of the pre-trained tokenizer to save. ' + 'If not None, the tokenizer will be saved. ' + 'Only used when converting a Megatron checkpoint to a Transformers' + ' checkpoint.'), ) parser.add_argument( '--max_shard_size', type=str, default='10GB', - help= - ('The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size ' - 'lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). ' - 'Only used when converting a Megatron checkpoint to a Transformers checkpoint.' - ), + help='The maximum size for a checkpoint before being sharded. ' + 'Checkpoints shard will then be each of size lower than this ' + 'size. If expressed as a string, needs to be digits followed by ' + 'a unit (like `5MB`). Only used when converting a Megatron ' + 'checkpoint to a Transformers checkpoint.', ) return parser @@ -122,12 +119,14 @@ def add_transformers_checkpoint_args(parser): def recursive_print(name, val, spaces=0): """ - Recursively print the structure of a checkpoint. This function is taken from `convert_megatron_gpt2_checkpoint.py` + Recursively print the structure of a checkpoint. This function is taken + from `convert_megatron_gpt2_checkpoint.py` Args: name (str): the name of the current tensor parameter val (Tuple(int)): the shape of the current tensor parameter - spaces (int): the number of spaces to print before the output for a nested structure + spaces (int): the number of spaces to print before the output for a + nested structure """ # Format the message. if name is None: @@ -151,16 +150,20 @@ def recursive_print(name, val, spaces=0): def megatron_to_transformers_fix_query_key_value_ordering( param, checkpoint_version, num_splits, num_heads, hidden_size): """ - Permutes layout of param tensor to [num_splits * num_heads * hidden_size, :] for compatibility with later versions - of NVIDIA Megatron-LM. 
The inverse operation is performed inside Megatron-LM to read checkpoints: - https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209 If param is the weight tensor of the - self-attention block, the returned tensor will have to be transposed one more time to be read by HuggingFace GPT2. - This function is taken from `convert_megatron_gpt2_checkpoint.py` + Permutes layout of param tensor to + [num_splits * num_heads * hidden_size, :] for compatibility with later + versions of NVIDIA Megatron-LM. The inverse operation is performed inside + Megatron-LM to read checkpoints: + https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209 + If param is the weight tensor of the self-attention block, the returned + tensor will have to be transposed one more time to be read by HuggingFace + GPT2. This function is taken from `convert_megatron_gpt2_checkpoint.py` Args: param (torch.Tensor): the tensor to permute checkpoint_version (int): the version of the checkpoint. - num_splits (int): the number of projections, usually 3 for (Query, Key, Value) + num_splits (int): the number of projections, usually 3 for + (Query, Key, Value) num_heads (int): the number of attention heads hidden_size (int): the hidden size per head """ @@ -184,15 +187,19 @@ def megatron_to_transformers_fix_query_key_value_ordering( def transformers_to_megatron_fix_query_key_value_ordering( param, checkpoint_version, num_splits, num_heads, hidden_size): """ - Permutes layout of param tensor to the one compatible with respective NVIDIA Megatron-LM chekpoint versions. Input - is [num_splits * num_heads * hidden_size, :] and output is [num_heads * hidden_size * num_splits, :] for version - 1.0 and [num_heads * num_splits * hidden_size, :] for version 2.0 and later. If param is the weight tensor of the - self-attention block, the param needs to be already transposed before calling this function. + Permutes layout of param tensor to the one compatible with respective + NVIDIA Megatron-LM chekpoint versions. Input is + [num_splits * num_heads * hidden_size, :] and output is + [num_heads * hidden_size * num_splits, :] for version 1.0 and + [num_heads * num_splits * hidden_size, :] for version 2.0 and later. If + param is the weight tensor of the self-attention block, the param needs to + be already transposed before calling this function. Args: param (torch.Tensor): the tensor to permute checkpoint_version (int): the version of the checkpoint. - num_splits (int): the number of projections, usually 3 for (Query, Key, Value) + num_splits (int): the number of projections, usually 3 for + (Query, Key, Value) num_heads (int): the number of attention heads hidden_size (int): the hidden size per head """ @@ -233,8 +240,9 @@ def merge_transformers_sharded_states(path, num_checkpoints): def get_megatron_sharded_states(args, tp_size, pp_size, pp_rank): """ - Get sharded checkpoints from NVIDIA Megatron-LM checkpoint based on the provided tensor parallel size, pipeline - parallel size and pipeline parallel rank. + Get sharded checkpoints from NVIDIA Megatron-LM checkpoint based on the + provided tensor parallel size, pipeline parallel size and pipeline parallel + rank. 
Args: args (argparse.Namespace): the arguments to the script @@ -244,7 +252,8 @@ def get_megatron_sharded_states(args, tp_size, pp_size, pp_rank): """ tp_state_dicts = [] for i in range(tp_size): - sub_dir_name = f'mp_rank_{i:02d}' if pp_size == 1 else f'mp_rank_{i:02d}_{pp_rank:03d}' + sub_dir_name = f'mp_rank_{i:02d}' if pp_size == 1 \ + else f'mp_rank_{i:02d}_{pp_rank:03d}' checkpoint_name = os.listdir(os.path.join(args.load_path, sub_dir_name))[0] checkpoint_path = os.path.join(args.load_path, sub_dir_name, @@ -256,7 +265,8 @@ def get_megatron_sharded_states(args, tp_size, pp_size, pp_rank): def get_element_from_dict_by_path(d, path): """ - Get element from dictionary by path. If element is not present, recursively add empty dictionaries. + Get element from dictionary by path. If element is not present, recursively + add empty dictionaries. Args: d (dict): the dictionary to get the element from @@ -272,9 +282,11 @@ def get_element_from_dict_by_path(d, path): def convert_checkpoint_from_megatron_to_transformers(args): """ - Convert NVIDIA Megatron-LM checkpoint to HuggingFace Transformers checkpoint. This handles Megatron checkpoints - with different tensor parallelism and pipeline parallelism sizes. It saves the converted checkpoint into shards - using HuggingFace Transformers checkpoint sharding functionality. This greatly extends the functionality of + Convert NVIDIA Megatron-LM checkpoint to HuggingFace Transformers + checkpoint. This handles Megatron checkpoints with different tensor + parallelism and pipeline parallelism sizes. It saves the converted + checkpoint into shards using HuggingFace Transformers checkpoint sharding + functionality. This greatly extends the functionality of `convert_megatron_gpt2_checkpoint.py` Args: @@ -290,18 +302,18 @@ def convert_checkpoint_from_megatron_to_transformers(args): rank0_checkpoint_path = os.path.join(args.load_path, sub_dir, rank0_checkpoint_name) break - print( - f'Loading Megatron-LM checkpoint arguments from: {rank0_checkpoint_path}' - ) + print(f'Loading Megatron-LM checkpoint arguments from: ' + f'{rank0_checkpoint_path}') state_dict = torch.load(rank0_checkpoint_path, map_location='cpu') megatron_args = state_dict.get('args', None) if megatron_args is None: raise ValueError( - 'Megatron-LM checkpoint does not contain arguments. This utility only supports Megatron-LM checkpoints' - ' containing all the megatron arguments. This is because it loads all config related to model' - ' architecture, the tensor and pipeline model parallel size from the checkpoint insead of user having to' - ' manually specify all the details. Please save Megatron-LM checkpoint along with all the megatron' - ' arguments to use this utility.') + 'Megatron-LM checkpoint does not contain arguments. This utility ' + 'only supports Megatron-LM checkpoints containing all the ' + 'megatron arguments. This is because it loads all config related ' + 'to model architecture, the tensor and pipeline model parallel ' + 'size from the checkpoint instead of user having to manually ' + 'specify all the details. Please save Megatron-LM checkpoint ' + 'along with all the megatron arguments to use this utility.') # Create Transformers GPT2 config from Megatron-LM arguments if megatron_args is not None: @@ -387,7 +399,6 @@ def convert_checkpoint_from_megatron_to_transformers(args): heads = config.num_attention_heads # The hidden_size per head. 
hidden_size_per_head = config.hidden_size // config.num_attention_heads - n_positions = config.max_position_embeddings num_layers = config.num_hidden_layers // pp_size for pp_rank in range(pp_size): @@ -413,7 +424,7 @@ def convert_checkpoint_from_megatron_to_transformers(args): # The index of the layer. layer_idx = int(m.group(1)) + pp_rank * num_layers # The name of the operation. - # dawei: input_layernorm, self_attention, mlp, post_attention_layernorm + # dawei: input_layernorm, self_attention, mlp, post_attention_layernorm # noqa: E501 op_name = m.group(2) # Is it a weight or a bias? weight_or_bias = m.group(3) @@ -422,12 +433,12 @@ def convert_checkpoint_from_megatron_to_transformers(args): layer_name = f'model.layers.{layer_idx}' if op_name + '.' + weight_or_bias not in tensor_parallel_params: - # dawei: input_layernorm.weight, input_layernorm.bias, self_attention.dense.bias, - # dawei: self_attention_layernorm.weight, self_attention_layernorm.bias, mlp.dense_4h_to_h.bias - # dawei: post_attention_layernorm.weight, post_attention_layernorm.bias + # dawei: input_layernorm.weight, input_layernorm.bias, self_attention.dense.bias, # noqa: E501 + # dawei: self_attention_layernorm.weight, self_attention_layernorm.bias, mlp.dense_4h_to_h.bias # noqa: E501 + # dawei: post_attention_layernorm.weight, post_attention_layernorm.bias # noqa: E501 params = val.to(dtype) else: - # dawei: self_attention.query_key_value.weight, self_attention_query_value.bias, self_attention.dense.weight, + # dawei: self_attention.query_key_value.weight, self_attention_query_value.bias, self_attention.dense.weight, # noqa: E501 # mlp.dense_h_to_4h.weight, mlp.dense_h_to_4h.bias, # mlp.dense_4h_to_h.weight dim = 1 if op_name in [ @@ -438,7 +449,8 @@ def convert_checkpoint_from_megatron_to_transformers(args): # dawei: fix bug in swiglu and dense_h_to_4h.weight - if op_name == 'mlp.dense_h_to_4h' and weight_or_bias == 'weight': + if op_name == 'mlp.dense_h_to_4h' \ + and weight_or_bias == 'weight': params_list = [val] + [ get_element_from_dict_by_path(tp_state_dicts[tp_rank], f'{path}')[key] @@ -477,14 +489,15 @@ def convert_checkpoint_from_megatron_to_transformers(args): or op_name == 'self_attention.query_key_value' ) and weight_or_bias == 'weight': # dawei: (gpt2) self_attention.query_key_value.weight - out_val = megatron_to_transformers_fix_query_key_value_ordering( + out_val = megatron_to_transformers_fix_query_key_value_ordering( # noqa: E501 params, checkpoint_version, 3, heads, hidden_size_per_head, ) - # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D. + # Megatron stores (3*D) x D but transformers-GPT2 expects + # D x 3*D. # dawei: (3*D) x D out_val = out_val.contiguous() @@ -502,7 +515,7 @@ def convert_checkpoint_from_megatron_to_transformers(args): or op_name == 'self_attention.query_key_value' ) and weight_or_bias == 'bias': # dawei: (gpt2) self_attention.query_key_value.bias - out_val = megatron_to_transformers_fix_query_key_value_ordering( + out_val = megatron_to_transformers_fix_query_key_value_ordering( # noqa: E501 params, checkpoint_version, 3, heads, hidden_size_per_head) # dawei: split in to 3 bias q_b, k_b, v_b = torch.chunk(out_val, 3, dim=0) @@ -545,8 +558,8 @@ def convert_checkpoint_from_megatron_to_transformers(args): if config.num_hidden_layers != (layer_idx + 1): raise ValueError( - f'Expected {config.num_hidden_layers} layers but found {layer_idx + 1}' - ) + f'Expected {config.num_hidden_layers} layers but found ' + f'{layer_idx + 1}') # The final layernorm. 
print('Converting final layernorm') @@ -598,19 +611,19 @@ def convert_checkpoint_from_megatron_to_transformers(args): torch.save(shard, os.path.join(args.save_path, shard_file)) if index is None: - print( - f'Model weights saved in {os.path.join(args.save_path, WEIGHTS_NAME)}' - ) + print(f'Model weights saved in ' + f'{os.path.join(args.save_path, WEIGHTS_NAME)}') else: save_index_file = os.path.join(args.save_path, WEIGHTS_INDEX_NAME) # Save the index as well with open(save_index_file, 'w', encoding='utf-8') as f: content = json.dumps(index, indent=2, sort_keys=True) + '\n' f.write(content) - print( - f'The model is bigger than the maximum size per checkpoint ({args.max_shard_size}) and is going to be ' - f'split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the ' - f'index located at {save_index_file}.') + print(f'The model is bigger than the maximum size per checkpoint ' + f'({args.max_shard_size}) and is going to be split in ' + f'{len(shards)} checkpoint shards. You can find where each ' + f'parameters has been saved in the index located at ' + f'{save_index_file}.') def main(): diff --git a/tools/converter/modeling_megatron_llama.py b/tools/converter/modeling_megatron_llama.py index 268856001..b74179395 100644 --- a/tools/converter/modeling_megatron_llama.py +++ b/tools/converter/modeling_megatron_llama.py @@ -89,7 +89,7 @@ def __init__( ) -############################################################################################ +############################################################################### # Copied from transformers.models.bart.modeling_bart._make_causal_mask @@ -125,7 +125,7 @@ def _expand_mask(mask: torch.Tensor, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ + """ # noqa: E501 bsz, src_len = mask.size() tgt_len = tgt_len if tgt_len is not None else src_len @@ -169,8 +169,8 @@ def __init__(self, base=10000, device=None): super().__init__() - inv_freq = 1.0 / (base - **(torch.arange(0, dim, 2).float().to(device) / dim)) + inv_freq = 1.0 / (base**(torch.arange(0, dim, 2).float().to(device) / + dim)) # noqa: E225 self.register_buffer('inv_freq', inv_freq) # Build here to make `torch.jit.trace` work. @@ -179,7 +179,7 @@ def __init__(self, device=self.inv_freq.device, dtype=self.inv_freq.dtype) freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation + # Different from paper, but it uses a different permutation in order to obtain the same calculation # noqa: E501 emb = torch.cat((freqs, freqs), dim=-1) self.register_buffer('cos_cached', emb.cos()[None, None, :, :], @@ -190,14 +190,14 @@ def __init__(self, def forward(self, x, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. + # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. 
# noqa: E501 if seq_len > self.max_seq_len_cached: self.max_seq_len_cached = seq_len t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation + # Different from paper, but it uses a different permutation in order to obtain the same calculation # noqa: E501 emb = torch.cat((freqs, freqs), dim=-1).to(x.device) self.register_buffer('cos_cached', emb.cos()[None, None, :, :], @@ -219,7 +219,7 @@ def rotate_half(x): def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. # noqa: E501 cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] @@ -259,9 +259,9 @@ def __init__(self, config: MegatronLlamaConfig): self.max_position_embeddings = config.max_position_embeddings if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') + raise ValueError(f'hidden_size must be divisible by num_heads ' + f'(got `hidden_size`: {self.hidden_size}' + f' and `num_heads`: {self.num_heads}).') self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.use_bias) @@ -321,14 +321,15 @@ def forward( if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( - f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is' + f'Attention weights should be of size ' + f'{(bsz, self.num_heads, q_len, kv_seq_len)}, but is' f' {attn_weights.size()}') if attention_mask is not None: if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}' - ) + raise ValueError(f'Attention mask should be of size ' + f'{(bsz, 1, q_len, kv_seq_len)}, but is ' + f'{attention_mask.size()}') attn_weights = attn_weights + attention_mask attn_weights = torch.max( attn_weights, @@ -343,7 +344,8 @@ def forward( if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): raise ValueError( - f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is' + f'`attn_output` should be of size ' + f'{(bsz, self.num_heads, q_len, self.head_dim)}, but is' f' {attn_output.size()}') attn_output = attn_output.transpose(1, 2) @@ -396,7 +398,7 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ + """ # noqa: E501 residual = hidden_states @@ -444,11 +446,12 @@ def forward( Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" +""" # noqa: E501 @add_start_docstrings( - 'The bare LLaMA Model outputting raw hidden-states without any specific head on top.', + 'The bare LLaMA Model outputting raw hidden-states without any specific ' + 'head on top.', LLAMA_START_DOCSTRING, ) class LlamaPreTrainedModel(PreTrainedModel): @@ -535,11 +538,12 @@ def _set_gradient_checkpointing(self, module, value=False): more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" +""" # noqa: E501 @add_start_docstrings( - 'The bare LLaMA Model outputting raw hidden-states without any specific head on top.', + 'The bare LLaMA Model outputting raw hidden-states without any specific ' + 'head on top.', LLAMA_START_DOCSTRING, ) class LlamaModel(LlamaPreTrainedModel): @@ -548,14 +552,15 @@ class LlamaModel(LlamaPreTrainedModel): Args: config: MegatronLlamaConfig - """ + """ # noqa: E501 def __init__(self, config: MegatronLlamaConfig): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size - # TODO: position embeddings, should be removed if rotary position embedding + # TODO: position embeddings, should be removed if rotary position + # embedding self.embed_position = nn.Embedding(config.max_sequence_length, config.hidden_size) @@ -577,7 +582,8 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embed_tokens = value - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + # Copied from + # transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): # create causal mask @@ -617,37 +623,40 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_attentions = output_attentions if output_attentions is not None\ + else self.config.output_attentions output_hidden_states = (output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states) - use_cache = use_cache if use_cache is not None else self.config.use_cache + use_cache = use_cache if use_cache is not None \ + else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None \ + else self.config.use_return_dict # retrieve input_ids and inputs_embeds if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time' - ) + raise ValueError('You cannot specify both decoder_input_ids and ' + 'decoder_inputs_embeds at the same time') elif input_ids is not None: batch_size, seq_length = input_ids.shape elif inputs_embeds is not None: batch_size, seq_length, _ = inputs_embeds.shape else: - raise ValueError( - 'You have to specify either decoder_input_ids or decoder_inputs_embeds' - ) + raise ValueError('You have to specify either decoder_input_ids or ' + 'decoder_inputs_embeds') seq_length_with_past = seq_length past_key_values_length = 0 if past_key_values is not None: past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length + seq_length_with_past = seq_length_with_past + \ + 
past_key_values_length if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device + device = input_ids.device if input_ids is not None \ + else inputs_embeds.device position_ids = torch.arange(past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, @@ -677,8 +686,8 @@ def forward( if self.gradient_checkpointing and self.training: if use_cache: logger.warning_once( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' - ) + '`use_cache=True` is incompatible with gradient ' + 'checkpointing. Setting `use_cache=False`...') use_cache = False # decoder layers @@ -823,15 +832,18 @@ def forward( >>> generate_ids = model.generate(inputs.input_ids, max_length=30) >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." - ```""" + ```""" # noqa: E501 - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_attentions = output_attentions if output_attentions is not None\ + else self.config.output_attentions output_hidden_states = (output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None \ + else self.config.use_return_dict - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + # decoder outputs consists of + # (dec_features, layer_state, dec_hidden, dec_attn) outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, @@ -889,7 +901,8 @@ def prepare_inputs_for_generation(self, if past_key_values: position_ids = position_ids[:, -1].unsqueeze(-1) - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + # if `inputs_embeds` are passed, we only want to use them in the 1st + # generation step if inputs_embeds is not None and past_key_values is None: model_inputs = {'inputs_embeds': inputs_embeds} else: @@ -925,7 +938,7 @@ def _reorder_cache(past_key_values, beam_idx): no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in each row of the batch). - """, + """, # noqa: E501 LLAMA_START_DOCSTRING, ) class LlamaForSequenceClassification(LlamaPreTrainedModel): @@ -965,8 +978,9 @@ def forward( Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + """ # noqa: E501 + return_dict = return_dict if return_dict is not None \ + else self.config.use_return_dict transformer_outputs = self.model( input_ids, diff --git a/tools/evaluator/evaluator.py b/tools/evaluator/evaluator.py index c783fb4e1..2be62cbc6 100644 --- a/tools/evaluator/evaluator.py +++ b/tools/evaluator/evaluator.py @@ -5,7 +5,6 @@ import time import yaml - from gpt_eval.gpt_evaluator import GPTEvaluator from recorder.wandb_writer import HelmWriter @@ -25,15 +24,13 @@ def parse_args(): def check_args(args): - if args.begin_iteration == None: - print( - f'--begin-iteration is not provided, use the value of --iteration-interval ({args.iteration_interval}).' - ) + if args.begin_iteration is None: + print(f'--begin-iteration is not provided, use the value of ' + f'--iteration-interval ({args.iteration_interval}).') args.begin_iteration = args.iteration_interval - if args.end_iteration == None: - print( - f'--end-iteration is not provided, evaluator will monitor the traning process continuously.' - ) + if args.end_iteration is None: + print('--end-iteration is not provided, evaluator will monitor the ' + 'training process continuously.') args.end_iteration = float('inf') @@ -80,8 +77,9 @@ def load_config(self): self.tokenizer_path = None else: raise NotImplementedError( - f"tokenizer type: {self.config['megatron']['tokenizer_type']} is not supported" - ) + f'tokenizer type: ' + f"{self.config['megatron']['tokenizer_type']} is not " + f'supported') self.megatron_log_path = os.path.join(self.cache_dir, 'megatron.log') if 'log_path' in self.config['megatron']: @@ -157,9 +155,8 @@ def run_megatron_server(self, iteration): print(f'Wait for megatron checkpoint {iteration}') time.sleep(self.check_iterval * 60) # setup megatron server - print( - f'Start megatron text generation server for checkpoint iter_{iteration}' - ) + print(f'Start megatron text generation server for checkpoint ' + f'iter_{iteration}') args = [ 'torchrun', '--master_addr', '127.0.0.1', '--master_port', '5950', '--nproc_per_node', @@ -181,7 +178,7 @@ def run_megatron_server(self, iteration): def stop_megatron_server(self, process, logfile): process.terminate() logfile.close() - print(f'Stop megatron text generation server') + print('Stop megatron text generation server') def run_megatron_inference(self, iteration): while not self.megatron_checkpoint_exists(iteration): diff --git a/tools/evaluator/gpt_eval/answer_generator.py b/tools/evaluator/gpt_eval/answer_generator.py index c9b5cc233..000abef20 100644 --- a/tools/evaluator/gpt_eval/answer_generator.py +++ b/tools/evaluator/gpt_eval/answer_generator.py @@ -6,13 +6,12 @@ from abc import ABC, abstractmethod import jsonlines +import openai import requests import yaml from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer -import openai - def parse_args(): parser = argparse.ArgumentParser() @@ -28,7 +27,7 @@ class AbstractGenerator(ABC): @abstractmethod def generate(self, texts, max_tokens, temperature): - raise NotImplementedError(f'GENERATE is not implemented') + raise NotImplementedError('GENERATE is not implemented') def close(self): # do nothing diff --git a/tools/evaluator/gpt_eval/gpt_evaluator.py b/tools/evaluator/gpt_eval/gpt_evaluator.py index e3d8c8adb..dd11e10a3 100644 --- a/tools/evaluator/gpt_eval/gpt_evaluator.py +++ b/tools/evaluator/gpt_eval/gpt_evaluator.py @@ -9,11 +9,10 @@ from multiprocessing import Pool import jsonlines +import 
openai import yaml from tqdm import tqdm -import openai - logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -96,9 +95,9 @@ def parse_score(self, review): if len(sp) == 2: return [float(sp[0]), float(sp[1])] else: - logger.error(f'Invalid score pair.') + logger.error('Invalid score pair.') return [0, 0] - except Exception as e: + except Exception: logger.error('Invalid answer') return [0, 0] diff --git a/tools/evaluator/recorder/wandb_writer.py b/tools/evaluator/recorder/wandb_writer.py index 95fa2744f..f05de8b9c 100644 --- a/tools/evaluator/recorder/wandb_writer.py +++ b/tools/evaluator/recorder/wandb_writer.py @@ -101,7 +101,8 @@ def __init__(self, self.leaderboard = leaderboard if self.leaderboard: self.leaderboard_metrics = self.conf[ - 'leaderboard_metrics'] if 'leaderboard_metrics' in self.conf else DEFAULT_HELM_METRICS + 'leaderboard_metrics'] if 'leaderboard_metrics' in self.conf \ + else DEFAULT_HELM_METRICS self.excluded_models = self.conf[ 'excluded_models'] if 'excluded_models' in self.conf else [] return @@ -324,7 +325,7 @@ def main(): else: raise NotImplementedError( f"Unsupported type for eval type {eval['eval_type']}") - if 'leaderboard' in config and config['leaderboard'] == True: + if 'leaderboard' in config and config['leaderboard'] is True: HelmWriter(project_name=config['project'], base_url=config['base_url'], leaderboard=True, diff --git a/tools/hpo/execute_hpo.py b/tools/hpo/execute_hpo.py index 7f7bb2d09..860827700 100644 --- a/tools/hpo/execute_hpo.py +++ b/tools/hpo/execute_hpo.py @@ -3,9 +3,9 @@ import wandb import yaml from jsonargparse import namespace_to_dict +from objects import get_hpo_objective from data_juicer.config import init_configs, merge_config -from objects import get_hpo_objective # 1: load the defined search space sweep_cfg_file_path = None diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py index 9df971e80..b0c1495df 100644 --- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py +++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py @@ -42,11 +42,11 @@ # }, # { # "from": "human", -# "value": "Is the bus driving down the street or pulled off to the side?" +# "value": "Is the bus driving down the street or pulled off to the side?" # noqa: E501 # }, # { # "from": "gpt", -# "value": "The bus is driving down the street, which is crowded with people and other vehicles." +# "value": "The bus is driving down the street, which is crowded with people and other vehicles." 
# noqa: E501 # } # ] # }, @@ -56,27 +56,28 @@ # Reference: # https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md +import json import os + import fire -import json import jsonlines as jl import regex as re - -from tqdm import tqdm from loguru import logger +from tqdm import tqdm from data_juicer.utils.mm_utils import SpecialTokens + @logger.catch def main( - dj_ds_path: str, - target_llava_ds_path: str, - keep_only_first_image: bool = True, - eoc_special_token: str = SpecialTokens.eoc, - image_special_token: str = '', - sent_seperator: str = '\n', - convert_to_relative_paths: bool = False, - original_llava_ds_path: str = None, + dj_ds_path: str, + target_llava_ds_path: str, + keep_only_first_image: bool = True, + eoc_special_token: str = SpecialTokens.eoc, + image_special_token: str = '', + sent_seperator: str = '\n', + convert_to_relative_paths: bool = False, + original_llava_ds_path: str = None, ): """ Convert a Data-Juicer-format dataset to a LLaVA-like dataset. @@ -119,7 +120,7 @@ def main( f'Input dataset [{dj_ds_path}] can not be found.') if not target_llava_ds_path.endswith('.json'): raise ValueError( - f'Only support "json" target dataset file for LLaVA now.') + 'Only support "json" target dataset file for LLaVA now.') if os.path.dirname(target_llava_ds_path) \ and not os.path.exists(os.path.dirname(target_llava_ds_path)): logger.info( @@ -129,19 +130,19 @@ def main( # check if the default image special token is changed if image_special_token != '': - logger.warning(f'The image_special_token used in the original LLaVA ' - f'dataset is "". It\'s better to align the this ' - f'token. There might be some compatibility problem if ' - f'you change it.') + logger.warning('The image_special_token used in the original LLaVA ' + 'dataset is "". It\'s better to align the this ' + 'token. There might be some compatibility problem if ' + 'you change it.') # if convert_to_relative_paths is True, check if the original_llava_ds_path # is provided as well. if convert_to_relative_paths: if not original_llava_ds_path: - raise ValueError(f'When convert_to_relative_paths is set to True, ' - f'the original_llava_ds_path must be provided ' - f'for recovering the relative paths. Please ' - f'check and retry.') + raise ValueError('When convert_to_relative_paths is set to True, ' + 'the original_llava_ds_path must be provided ' + 'for recovering the relative paths. Please ' + 'check and retry.') original_llava_ds_path = os.path.abspath(original_llava_ds_path) # if provided original_llava_ds_path is the dataset file path, only # keep the directory path. @@ -201,17 +202,11 @@ def main( if sent.endswith(sent_seperator): sent = sent[:-len(sent_seperator)].strip() - conversation = { - 'from': role, - 'value': sent - } + conversation = {'from': role, 'value': sent} conversations.append(conversation) # make up the new sample - new_sample = { - 'id': id, - 'conversations': conversations - } + new_sample = {'id': id, 'conversations': conversations} if len(images) == 1: image_path = images[0] if convert_to_relative_paths: diff --git a/tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py b/tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py index 4cff127e7..58007abf2 100644 --- a/tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py +++ b/tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py @@ -27,11 +27,11 @@ # }, # { # "from": "human", -# "value": "Is the bus driving down the street or pulled off to the side?" 
+# "value": "Is the bus driving down the street or pulled off to the side?" # noqa: E501 # }, # { # "from": "gpt", -# "value": "The bus is driving down the street, which is crowded with people and other vehicles." +# "value": "The bus is driving down the street, which is crowded with people and other vehicles." # noqa: E501 # } # ] # }, @@ -56,30 +56,30 @@ # Reference: # https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md +import json import os -import fire import random -import json -import jsonlines as jl -from tqdm import tqdm +import fire +import jsonlines as jl from loguru import logger +from tqdm import tqdm from data_juicer.utils.mm_utils import SpecialTokens @logger.catch def main( - llava_ds_path: str, - target_ds_path: str, - str_id: bool = True, - split_chunk: bool = False, - image_broadcast: bool = False, - image_broadcast_pos: str = 'random', - eoc_special_token: str = SpecialTokens.eoc, - image_special_token: str = '', - add_eoc_at_last: bool = True, - sent_seperator: str = '\n', + llava_ds_path: str, + target_ds_path: str, + str_id: bool = True, + split_chunk: bool = False, + image_broadcast: bool = False, + image_broadcast_pos: str = 'random', + eoc_special_token: str = SpecialTokens.eoc, + image_special_token: str = '', + add_eoc_at_last: bool = True, + sent_seperator: str = '\n', ): """ Convert a LLaVA-like dataset to the Data-Juicer format. @@ -122,7 +122,7 @@ def main( raise FileNotFoundError(f'Input LLaVA dataset [{llava_ds_path}] can ' f'not be found.') if not target_ds_path.endswith('.jsonl'): - raise ValueError(f'Only support "jsonl" target dataset file now.') + raise ValueError('Only support "jsonl" target dataset file now.') if os.path.dirname(target_ds_path) \ and not os.path.exists(os.path.dirname(target_ds_path)): logger.info(f'Create directory [{os.path.dirname(target_ds_path)}] ' @@ -139,18 +139,18 @@ def main( f'given [{image_broadcast_pos}]') # check if the default image special token is changed if image_special_token != '': - logger.warning(f'The image_special_token used in the original LLaVA ' - f'dataset is "". It\'s better to align the this ' - f'token. There might be some compatibility problem if ' - f'you change it.') + logger.warning('The image_special_token used in the original LLaVA ' + 'dataset is "". It\'s better to align the this ' + 'token. There might be some compatibility problem if ' + 'you change it.') # check whether to add the eoc special token at last if not add_eoc_at_last: - logger.warning(f'You choose not to add special eoc token at the last, ' - f'which might cause some compatibility problems for ' - f'other type of datasets (e.g. OpenFlamingo).') + logger.warning('You choose not to add special eoc token at the last, ' + 'which might cause some compatibility problems for ' + 'other type of datasets (e.g. OpenFlamingo).') # load LLaVA dataset - logger.info(f'Loading original LLaVA dataset.') + logger.info('Loading original LLaVA dataset.') llava_ds = json.load(open(llava_ds_path, 'r', encoding='utf-8')) logger.info(f'Load [{len(llava_ds)}] samples.') @@ -215,8 +215,7 @@ def main( f'[{id}] is neither before nor after the text. ' f'The position might be wrong or there is no ' f'image_special_token in this sample. Please ' - f'check and fix the dataset and retry.' 
- ) + f'check and fix the dataset and retry.') images.append(image) else: # whether broadcast image special token to following @@ -249,7 +248,7 @@ def main( # combine these texts together new_sent = role_human + sent_human + sent_seperator \ - + role_robot + sent_robot + + role_robot + sent_robot formatted_conversations.append(new_sent) join_sep = sent_seperator diff --git a/tools/preprocess/serialize_meta.py b/tools/preprocess/serialize_meta.py index 327132f4a..825c99746 100644 --- a/tools/preprocess/serialize_meta.py +++ b/tools/preprocess/serialize_meta.py @@ -23,8 +23,8 @@ def meta_serialize(file_name, target_file, text_key, serialized_key): :param file_name: path to source jsonl files. :param target_file: path to store the converted jsonl files. :text_key: the key corresponding to the field that will not be serialized. - :param serialized_key: the key corresponding to the field that the serialized - info saved. Default it's 'source_info'. + :param serialized_key: the key corresponding to the field that the + serialized info saved. Default it's 'source_info'. """ with open(target_file, 'w') as fw: target = {} @@ -49,8 +49,8 @@ def main(src_dir, :param target_dir: path to save the converted jsonl files. :param text_key: the key corresponding to the field that will not be serialized. Default it's 'text'. - :param serialized_key: the key corresponding to the field that the serialized - info saved. Default it's 'source_info'. + :param serialized_key: the key corresponding to the field that the + serialized info saved. Default it's 'source_info'. :param num_proc: number of process worke. Default it's 1. """ @@ -62,8 +62,8 @@ def main(src_dir, text_key = [text_key] for key in text_key: - assert key != serialized_key, "text_key '{}' cannot be the same as serialized_key.".format( - key) + assert key != serialized_key, "text_key '{}' cannot be the same as " \ + 'serialized_key.'.format(key) # check if the source directory exists if not os.path.exists(src_dir):