diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 000000000..b88d2ec96 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,14 @@ + +name: pre-commit + +on: [push, pull_request] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: '3.8' + - uses: pre-commit/action@v3.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 558eb7bb2..cb4b56d42 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: flake8 - repo: https://github.com/PyCQA/isort.git - rev: 4.3.21 + rev: 5.12.0 hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-yapf @@ -34,4 +34,10 @@ repos: exclude: thirdparty/ args: [ "--fix=lf" ] -exclude: 'docs/.*' +exclude: | + (?x)^( + docs/.*| + tests/.*| + demos/.*| + .*\.md + )$ diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py index f59d8cdc9..c6eef4329 100644 --- a/data_juicer/config/config.py +++ b/data_juicer/config/config.py @@ -113,15 +113,15 @@ def init_configs(args=None): type=str, default=SpecialTokens.image, help='The special token that represents an image in the text. In ' - 'default, it\'s "<__dj__image>". You can specify your own special' - ' token according to your input dataset.') + 'default, it\'s "<__dj__image>". You can specify your own special' + ' token according to your input dataset.') parser.add_argument( '--eoc_special_token', type=str, default=SpecialTokens.eoc, help='The special token that represents the end of a chunk in the ' - 'text. In default, it\'s "<|__dj__eoc|>". You can specify your ' - 'own special token according to your input dataset.') + 'text. In default, it\'s "<|__dj__eoc|>". You can specify your ' + 'own special token according to your input dataset.') parser.add_argument( '--suffixes', type=Union[str, List[str], Tuple[str]], @@ -314,8 +314,8 @@ def init_setup_from_cfg(cfg): if os.path.isdir(cfg.dataset_path): cfg.dataset_dir = os.path.abspath(cfg.dataset_path) else: - cfg.dataset_dir = os.path.abspath( - os.path.dirname(cfg.dataset_path)) + cfg.dataset_dir = os.path.abspath(os.path.dirname( + cfg.dataset_path)) else: logger.error(f'Input dataset_path [{cfg.dataset_path}] is invalid. ' f'Please check and retry.') @@ -445,8 +445,9 @@ def config_backup(cfg): def display_config(cfg): - from tabulate import tabulate import pprint + + from tabulate import tabulate table_header = ['key', 'values'] # remove ops outside the process list for better displaying diff --git a/data_juicer/core/exporter.py b/data_juicer/core/exporter.py index 1ca6f19bf..7377e5225 100644 --- a/data_juicer/core/exporter.py +++ b/data_juicer/core/exporter.py @@ -197,7 +197,10 @@ def to_json(dataset, export_path, num_proc=1, **kwargs): :param kwargs: extra arguments. 
:return: """ - dataset.to_json(export_path, force_ascii=False, num_proc=num_proc, lines=False) + dataset.to_json(export_path, + force_ascii=False, + num_proc=num_proc, + lines=False) @staticmethod def to_parquet(dataset, export_path, **kwargs): diff --git a/data_juicer/core/ray_executor.py b/data_juicer/core/ray_executor.py index d1290c206..513e16151 100644 --- a/data_juicer/core/ray_executor.py +++ b/data_juicer/core/ray_executor.py @@ -67,8 +67,8 @@ def run(self, load_data_np=None): dataset = dataset.filter(op.process) else: logger.error( - 'Ray executor only support Filter and Mapper OPs for now' - ) + 'Ray executor only support Filter and Mapper OPs for ' + 'now') raise NotImplementedError except: # noqa: E722 logger.error(f'An error occurred during Op [{op_name}].') diff --git a/data_juicer/format/formatter.py b/data_juicer/format/formatter.py index a297463b7..86ea481db 100644 --- a/data_juicer/format/formatter.py +++ b/data_juicer/format/formatter.py @@ -51,9 +51,7 @@ def __init__( self.data_files = find_files_with_suffix(dataset_path, suffixes) self.add_suffix = add_suffix - def load_dataset(self, - num_proc: int = 1, - global_cfg=None) -> Dataset: + def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: """ Load a dataset from dataset file or dataset directory, and unify its format. @@ -103,9 +101,7 @@ def __init__(self, self.text_keys = text_keys self.kwargs = kwargs - def load_dataset(self, - num_proc: int = 1, - global_cfg=None) -> Dataset: + def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: """ Load a dataset from HuggingFace, and unify its format. @@ -226,8 +222,10 @@ def rel2abs(sample, path_keys, dataset_dir): paths = sample[path_key] if not paths: continue - new_paths = [os.path.join(dataset_dir, path) - for path in paths if not os.path.isabs(path)] + new_paths = [ + os.path.join(dataset_dir, path) for path in paths + if not os.path.isabs(path) + ] sample[path_key] = new_paths return sample @@ -240,10 +238,10 @@ def rel2abs(sample, path_keys, dataset_dir): 'dataset_dir': ds_dir }) else: - logger.warning(f'No global config passed into unify_format function. ' - f'Relative paths in the dataset might not be converted ' - f'to their absolute versions. Data of other modalities ' - f'might not be able to find by Data-Juicer.') + logger.warning('No global config passed into unify_format function. ' + 'Relative paths in the dataset might not be converted ' + 'to their absolute versions. Data of other modalities ' + 'might not be able to find by Data-Juicer.') return dataset diff --git a/data_juicer/format/text_formatter.py b/data_juicer/format/text_formatter.py index fdad34fac..0f34930c7 100644 --- a/data_juicer/format/text_formatter.py +++ b/data_juicer/format/text_formatter.py @@ -96,9 +96,7 @@ def __init__(self, self.dataset_path = dataset_path self.add_suffix = add_suffix - def load_dataset(self, - num_proc: int = 1, - global_cfg=None) -> Dataset: + def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: """ Load a dataset from local text-type files. diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index be45a9673..4eabfdca5 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -2,11 +2,14 @@ OPERATORS = Registry('Operators') + class OP: - def __init__(self, - text_key: str = None, - image_key: str = None, - ): + + def __init__( + self, + text_key: str = None, + image_key: str = None, + ): """ Base class of operators. 
@@ -29,12 +32,14 @@ def __init__(self, def process(self, *args, **kwargs): raise NotImplementedError + class Mapper(OP): - def __init__(self, - text_key: str = None, - image_key: str = None, - ): + def __init__( + self, + text_key: str = None, + image_key: str = None, + ): """ Base class that conducts data editing. @@ -63,10 +68,11 @@ def is_batched_op(self): class Filter(OP): - def __init__(self, - text_key: str = None, - image_key: str = None, - ): + def __init__( + self, + text_key: str = None, + image_key: str = None, + ): """ Base class that removes specific info. @@ -104,10 +110,11 @@ def process(self, sample): class Deduplicator(OP): - def __init__(self, - text_key: str = None, - image_key: str = None, - ): + def __init__( + self, + text_key: str = None, + image_key: str = None, + ): """ Base class that conducts deduplication. @@ -144,10 +151,11 @@ def process(self, dataset, show_num=0): class Selector(OP): - def __init__(self, - text_key: str = None, - image_key: str = None, - ): + def __init__( + self, + text_key: str = None, + image_key: str = None, + ): """ Base class that conducts selection in dataset-level. diff --git a/data_juicer/ops/filter/character_repetition_filter.py b/data_juicer/ops/filter/character_repetition_filter.py index 3e5ce7e34..e67423030 100644 --- a/data_juicer/ops/filter/character_repetition_filter.py +++ b/data_juicer/ops/filter/character_repetition_filter.py @@ -13,7 +13,7 @@ @OPERATORS.register_module('character_repetition_filter') class CharacterRepetitionFilter(Filter): """Filter to keep samples with char-level n-gram repetition ratio within a - \ specific range.""" + specific range.""" def __init__(self, rep_len: PositiveInt = 10, diff --git a/data_juicer/ops/filter/image_aspect_ratio_filter.py b/data_juicer/ops/filter/image_aspect_ratio_filter.py index 0af4ec214..fe205e5c6 100644 --- a/data_juicer/ops/filter/image_aspect_ratio_filter.py +++ b/data_juicer/ops/filter/image_aspect_ratio_filter.py @@ -1,13 +1,11 @@ - import numpy as np - from jsonargparse.typing import PositiveFloat from data_juicer.utils.constant import Fields, StatsKeys +from data_juicer.utils.mm_utils import load_image from ..base_op import OPERATORS, Filter from ..op_fusion import LOADED_IMAGES -from data_juicer.utils.mm_utils import load_image @OPERATORS.register_module('image_aspect_ratio_filter') @@ -85,7 +83,8 @@ def process(self, sample): aspect_ratios = sample[Fields.stats][StatsKeys.aspect_ratios] keep_bools = np.array([ self.min_ratio <= aspect_ratio <= self.max_ratio - for aspect_ratio in aspect_ratios]) + for aspect_ratio in aspect_ratios + ]) if len(keep_bools) <= 0: return True @@ -94,4 +93,3 @@ def process(self, sample): return keep_bools.any() else: return keep_bools.all() - diff --git a/data_juicer/ops/filter/language_id_score_filter.py b/data_juicer/ops/filter/language_id_score_filter.py index cc3520be5..800f635cf 100644 --- a/data_juicer/ops/filter/language_id_score_filter.py +++ b/data_juicer/ops/filter/language_id_score_filter.py @@ -57,6 +57,7 @@ def compute_stats(self, sample): def process(self, sample): if self.lang: return sample[Fields.stats][StatsKeys.lang] == self.lang \ - and sample[Fields.stats][StatsKeys.lang_score] >= self.min_score + and sample[Fields.stats][StatsKeys.lang_score] >= \ + self.min_score else: return sample[Fields.stats][StatsKeys.lang_score] >= self.min_score diff --git a/data_juicer/ops/filter/special_characters_filter.py b/data_juicer/ops/filter/special_characters_filter.py index 7b6cad2a8..3b1b1a893 100644 --- 
a/data_juicer/ops/filter/special_characters_filter.py +++ b/data_juicer/ops/filter/special_characters_filter.py @@ -50,7 +50,8 @@ def compute_stats(self, sample): return sample def process(self, sample): - if self.min_ratio <= sample[Fields.stats][StatsKeys.special_char_ratio] \ + if self.min_ratio <= \ + sample[Fields.stats][StatsKeys.special_char_ratio] \ <= self.max_ratio: return True else: diff --git a/data_juicer/ops/filter/word_repetition_filter.py b/data_juicer/ops/filter/word_repetition_filter.py index 1d8d232ce..3883541e8 100644 --- a/data_juicer/ops/filter/word_repetition_filter.py +++ b/data_juicer/ops/filter/word_repetition_filter.py @@ -17,7 +17,7 @@ @INTER_WORDS.register_module('word_repetition_filter') class WordRepetitionFilter(Filter): """Filter to keep samples with word-level n-gram repetition ratio within a - \ specific range.""" + specific range.""" def __init__(self, lang: str = 'en', diff --git a/data_juicer/ops/mapper/punctuation_normalization_mapper.py b/data_juicer/ops/mapper/punctuation_normalization_mapper.py index e8cdf3e60..b6640e9eb 100644 --- a/data_juicer/ops/mapper/punctuation_normalization_mapper.py +++ b/data_juicer/ops/mapper/punctuation_normalization_mapper.py @@ -8,7 +8,7 @@ @OPERATORS.register_module('punctuation_normalization_mapper') class PunctuationNormalizationMapper(Mapper): """Mapper to normalize unicode punctuations to English punctuations in text - \ samples.""" + samples.""" def __init__(self, *args, **kwargs): """ diff --git a/data_juicer/ops/mapper/remove_comments_mapper.py b/data_juicer/ops/mapper/remove_comments_mapper.py index f49bc9065..b3533dd2b 100644 --- a/data_juicer/ops/mapper/remove_comments_mapper.py +++ b/data_juicer/ops/mapper/remove_comments_mapper.py @@ -14,7 +14,7 @@ class RemoveCommentsMapper(Mapper): """ Mapper to remove comments in different kinds of documents. - Only support 'tex' \ for now. + Only support 'tex' for now. """ def __init__(self, diff --git a/data_juicer/utils/mm_utils.py b/data_juicer/utils/mm_utils.py index b67484062..6ed0931e0 100644 --- a/data_juicer/utils/mm_utils.py +++ b/data_juicer/utils/mm_utils.py @@ -1,8 +1,8 @@ - from datasets import Image from data_juicer.utils.constant import DEFAULT_PREFIX + # A class to keep special tokens for multimodal information in the texts # The tokens in this class can be updated by corresponding arguments in config class SpecialTokens(object): @@ -12,9 +12,11 @@ class SpecialTokens(object): # others eoc = f'<|{DEFAULT_PREFIX}eoc|>' + def load_images(paths): return [load_image(path) for path in paths] + def load_image(path): img_feature = Image() img = img_feature.decode_example(img_feature.encode_example(path)) diff --git a/docs/DeveloperGuide.md b/docs/DeveloperGuide.md index bcaa3450a..2051f26b7 100644 --- a/docs/DeveloperGuide.md +++ b/docs/DeveloperGuide.md @@ -29,6 +29,12 @@ pre-commit run --all-files git commit -m "xxxx" ``` +**Note**: We have configured pre-commit checks in the GitHub workflow. If this +check fails for your PR, please ① make sure your local pre-commit dependencies +are consistent with the project configuration (you can refresh them with +`pre-commit clean` and `pre-commit install`); and ② run +`pre-commit run --all-files` locally before pushing. + ## Build your own ops - Data-Juicer allows everybody to build their own ops. 
diff --git a/docs/DeveloperGuide_ZH.md b/docs/DeveloperGuide_ZH.md index eb5f8e2a6..b3349067b 100644 --- a/docs/DeveloperGuide_ZH.md +++ b/docs/DeveloperGuide_ZH.md @@ -28,6 +28,8 @@ pre-commit run --all-files git commit -m "" ``` +**注意**:我们在github workflow配置了pre-commit的检查。如果您的PR中该检查没通过,请在本地①确保pre-commit 的相关依赖与项目配置一致(可通过`pre-commit clean`和`pre-commit install`完成);②push前执行了`pre-commit run --all-files`. + ## 构建自己的算子 - Data-Juicer 支持每个人定义自己的算子。 diff --git a/tools/converter/convert_gpt_to_transformers.py b/tools/converter/convert_gpt_to_transformers.py index 751866181..612ada6fc 100644 --- a/tools/converter/convert_gpt_to_transformers.py +++ b/tools/converter/convert_gpt_to_transformers.py @@ -28,16 +28,13 @@ import json import os import re -import sys -import types import torch -from transformers import AutoTokenizer, LlamaConfig +from modeling_megatron_llama import MegatronLlamaConfig +from transformers import AutoTokenizer from transformers.modeling_utils import (WEIGHTS_INDEX_NAME, WEIGHTS_NAME, shard_checkpoint) -from modeling_megatron_llama import MegatronLlamaConfig - def add_checkpointing_args(parser): parser.add_argument('--megatron-path', @@ -65,21 +62,21 @@ def add_transformers_checkpoint_args(parser): '--tokenizer_name', type=str, default=None, - help= - ('The name of the pre-trained tokenizer to save. ' - 'If not None, the tokenizer will be saved. ' - 'Only used when converting a Megatron checkpoint to a Transformers checkpoint.' - ), + help=( + 'The name of the pre-trained tokenizer to save. ' + 'If not None, the tokenizer will be saved. ' + 'Only used when converting a Megatron checkpoint to a Transformers' + ' checkpoint.'), ) parser.add_argument( '--max_shard_size', type=str, default='10GB', - help= - ('The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size ' - 'lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). ' - 'Only used when converting a Megatron checkpoint to a Transformers checkpoint.' - ), + help='The maximum size for a checkpoint before being sharded. ' + 'Checkpoints shard will then be each of size lower than this ' + 'size. If expressed as a string, needs to be digits followed by ' + 'a unit (like `5MB`). Only used when converting a Megatron ' + 'checkpoint to a Transformers checkpoint.', ) return parser @@ -122,12 +119,14 @@ def add_transformers_checkpoint_args(parser): def recursive_print(name, val, spaces=0): """ - Recursively print the structure of a checkpoint. This function is taken from `convert_megatron_gpt2_checkpoint.py` + Recursively print the structure of a checkpoint. This function is taken + from `convert_megatron_gpt2_checkpoint.py` Args: name (str): the name of the current tensor parameter val (Tuple(int)): the shape of the current tensor parameter - spaces (int): the number of spaces to print before the output for a nested structure + spaces (int): the number of spaces to print before the output for a + nested structure """ # Format the message. if name is None: @@ -151,16 +150,20 @@ def recursive_print(name, val, spaces=0): def megatron_to_transformers_fix_query_key_value_ordering( param, checkpoint_version, num_splits, num_heads, hidden_size): """ - Permutes layout of param tensor to [num_splits * num_heads * hidden_size, :] for compatibility with later versions - of NVIDIA Megatron-LM. 
The inverse operation is performed inside Megatron-LM to read checkpoints: - https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209 If param is the weight tensor of the - self-attention block, the returned tensor will have to be transposed one more time to be read by HuggingFace GPT2. - This function is taken from `convert_megatron_gpt2_checkpoint.py` + Permutes layout of param tensor to + [num_splits * num_heads * hidden_size, :] for compatibility with later + versions of NVIDIA Megatron-LM. The inverse operation is performed inside + Megatron-LM to read checkpoints: + https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209 + If param is the weight tensor of the self-attention block, the returned + tensor will have to be transposed one more time to be read by HuggingFace + GPT2. This function is taken from `convert_megatron_gpt2_checkpoint.py` Args: param (torch.Tensor): the tensor to permute checkpoint_version (int): the version of the checkpoint. - num_splits (int): the number of projections, usually 3 for (Query, Key, Value) + num_splits (int): the number of projections, usually 3 for + (Query, Key, Value) num_heads (int): the number of attention heads hidden_size (int): the hidden size per head """ @@ -184,15 +187,19 @@ def megatron_to_transformers_fix_query_key_value_ordering( def transformers_to_megatron_fix_query_key_value_ordering( param, checkpoint_version, num_splits, num_heads, hidden_size): """ - Permutes layout of param tensor to the one compatible with respective NVIDIA Megatron-LM chekpoint versions. Input - is [num_splits * num_heads * hidden_size, :] and output is [num_heads * hidden_size * num_splits, :] for version - 1.0 and [num_heads * num_splits * hidden_size, :] for version 2.0 and later. If param is the weight tensor of the - self-attention block, the param needs to be already transposed before calling this function. + Permutes layout of param tensor to the one compatible with respective + NVIDIA Megatron-LM chekpoint versions. Input is + [num_splits * num_heads * hidden_size, :] and output is + [num_heads * hidden_size * num_splits, :] for version 1.0 and + [num_heads * num_splits * hidden_size, :] for version 2.0 and later. If + param is the weight tensor of the self-attention block, the param needs to + be already transposed before calling this function. Args: param (torch.Tensor): the tensor to permute checkpoint_version (int): the version of the checkpoint. - num_splits (int): the number of projections, usually 3 for (Query, Key, Value) + num_splits (int): the number of projections, usually 3 for + (Query, Key, Value) num_heads (int): the number of attention heads hidden_size (int): the hidden size per head """ @@ -233,8 +240,9 @@ def merge_transformers_sharded_states(path, num_checkpoints): def get_megatron_sharded_states(args, tp_size, pp_size, pp_rank): """ - Get sharded checkpoints from NVIDIA Megatron-LM checkpoint based on the provided tensor parallel size, pipeline - parallel size and pipeline parallel rank. + Get sharded checkpoints from NVIDIA Megatron-LM checkpoint based on the + provided tensor parallel size, pipeline parallel size and pipeline parallel + rank. 
Args: args (argparse.Namespace): the arguments to the script @@ -244,7 +252,8 @@ def get_megatron_sharded_states(args, tp_size, pp_size, pp_rank): """ tp_state_dicts = [] for i in range(tp_size): - sub_dir_name = f'mp_rank_{i:02d}' if pp_size == 1 else f'mp_rank_{i:02d}_{pp_rank:03d}' + sub_dir_name = f'mp_rank_{i:02d}' if pp_size == 1 \ + else f'mp_rank_{i:02d}_{pp_rank:03d}' checkpoint_name = os.listdir(os.path.join(args.load_path, sub_dir_name))[0] checkpoint_path = os.path.join(args.load_path, sub_dir_name, @@ -256,7 +265,8 @@ def get_megatron_sharded_states(args, tp_size, pp_size, pp_rank): def get_element_from_dict_by_path(d, path): """ - Get element from dictionary by path. If element is not present, recursively add empty dictionaries. + Get element from dictionary by path. If element is not present, recursively + add empty dictionaries. Args: d (dict): the dictionary to get the element from @@ -272,9 +282,11 @@ def get_element_from_dict_by_path(d, path): def convert_checkpoint_from_megatron_to_transformers(args): """ - Convert NVIDIA Megatron-LM checkpoint to HuggingFace Transformers checkpoint. This handles Megatron checkpoints - with different tensor parallelism and pipeline parallelism sizes. It saves the converted checkpoint into shards - using HuggingFace Transformers checkpoint sharding functionality. This greatly extends the functionality of + Convert NVIDIA Megatron-LM checkpoint to HuggingFace Transformers + checkpoint. This handles Megatron checkpoints with different tensor + parallelism and pipeline parallelism sizes. It saves the converted + checkpoint into shards using HuggingFace Transformers checkpoint sharding + functionality. This greatly extends the functionality of `convert_megatron_gpt2_checkpoint.py` Args: @@ -290,18 +302,18 @@ def convert_checkpoint_from_megatron_to_transformers(args): rank0_checkpoint_path = os.path.join(args.load_path, sub_dir, rank0_checkpoint_name) break - print( - f'Loading Megatron-LM checkpoint arguments from: {rank0_checkpoint_path}' - ) + print(f'Loading Megatron-LM checkpoint arguments from: ' + f'{rank0_checkpoint_path}') state_dict = torch.load(rank0_checkpoint_path, map_location='cpu') megatron_args = state_dict.get('args', None) if megatron_args is None: raise ValueError( - 'Megatron-LM checkpoint does not contain arguments. This utility only supports Megatron-LM checkpoints' - ' containing all the megatron arguments. This is because it loads all config related to model' - ' architecture, the tensor and pipeline model parallel size from the checkpoint insead of user having to' - ' manually specify all the details. Please save Megatron-LM checkpoint along with all the megatron' - ' arguments to use this utility.') + 'Megatron-LM checkpoint does not contain arguments. This utility ' + 'only supports Megatron-LM checkpoints containing all the ' + 'megatron arguments. This is because it loads all config related ' + 'to model architecture, the tensor and pipeline model parallel ' + 'size from the checkpoint instead of user having to manually ' + 'specify all the details. Please save Megatron-LM checkpoint ' + 'along with all the megatron arguments to use this utility.') # Create Transformers GPT2 config from Megatron-LM arguments if megatron_args is not None: @@ -387,7 +399,6 @@ def convert_checkpoint_from_megatron_to_transformers(args): heads = config.num_attention_heads # The hidden_size per head. 
hidden_size_per_head = config.hidden_size // config.num_attention_heads - n_positions = config.max_position_embeddings num_layers = config.num_hidden_layers // pp_size for pp_rank in range(pp_size): @@ -413,7 +424,7 @@ def convert_checkpoint_from_megatron_to_transformers(args): # The index of the layer. layer_idx = int(m.group(1)) + pp_rank * num_layers # The name of the operation. - # dawei: input_layernorm, self_attention, mlp, post_attention_layernorm + # dawei: input_layernorm, self_attention, mlp, post_attention_layernorm # noqa: E501 op_name = m.group(2) # Is it a weight or a bias? weight_or_bias = m.group(3) @@ -422,12 +433,12 @@ def convert_checkpoint_from_megatron_to_transformers(args): layer_name = f'model.layers.{layer_idx}' if op_name + '.' + weight_or_bias not in tensor_parallel_params: - # dawei: input_layernorm.weight, input_layernorm.bias, self_attention.dense.bias, - # dawei: self_attention_layernorm.weight, self_attention_layernorm.bias, mlp.dense_4h_to_h.bias - # dawei: post_attention_layernorm.weight, post_attention_layernorm.bias + # dawei: input_layernorm.weight, input_layernorm.bias, self_attention.dense.bias, # noqa: E501 + # dawei: self_attention_layernorm.weight, self_attention_layernorm.bias, mlp.dense_4h_to_h.bias # noqa: E501 + # dawei: post_attention_layernorm.weight, post_attention_layernorm.bias # noqa: E501 params = val.to(dtype) else: - # dawei: self_attention.query_key_value.weight, self_attention_query_value.bias, self_attention.dense.weight, + # dawei: self_attention.query_key_value.weight, self_attention_query_value.bias, self_attention.dense.weight, # noqa: E501 # mlp.dense_h_to_4h.weight, mlp.dense_h_to_4h.bias, # mlp.dense_4h_to_h.weight dim = 1 if op_name in [ @@ -438,7 +449,8 @@ def convert_checkpoint_from_megatron_to_transformers(args): # dawei: fix bug in swiglu and dense_h_to_4h.weight - if op_name == 'mlp.dense_h_to_4h' and weight_or_bias == 'weight': + if op_name == 'mlp.dense_h_to_4h' \ + and weight_or_bias == 'weight': params_list = [val] + [ get_element_from_dict_by_path(tp_state_dicts[tp_rank], f'{path}')[key] @@ -477,14 +489,15 @@ def convert_checkpoint_from_megatron_to_transformers(args): or op_name == 'self_attention.query_key_value' ) and weight_or_bias == 'weight': # dawei: (gpt2) self_attention.query_key_value.weight - out_val = megatron_to_transformers_fix_query_key_value_ordering( + out_val = megatron_to_transformers_fix_query_key_value_ordering( # noqa: E501 params, checkpoint_version, 3, heads, hidden_size_per_head, ) - # Megatron stores (3*D) x D but transformers-GPT2 expects D x 3*D. + # Megatron stores (3*D) x D but transformers-GPT2 expects + # D x 3*D. # dawei: (3*D) x D out_val = out_val.contiguous() @@ -502,7 +515,7 @@ def convert_checkpoint_from_megatron_to_transformers(args): or op_name == 'self_attention.query_key_value' ) and weight_or_bias == 'bias': # dawei: (gpt2) self_attention.query_key_value.bias - out_val = megatron_to_transformers_fix_query_key_value_ordering( + out_val = megatron_to_transformers_fix_query_key_value_ordering( # noqa: E501 params, checkpoint_version, 3, heads, hidden_size_per_head) # dawei: split in to 3 bias q_b, k_b, v_b = torch.chunk(out_val, 3, dim=0) @@ -545,8 +558,8 @@ def convert_checkpoint_from_megatron_to_transformers(args): if config.num_hidden_layers != (layer_idx + 1): raise ValueError( - f'Expected {config.num_hidden_layers} layers but found {layer_idx + 1}' - ) + f'Expected {config.num_hidden_layers} layers but found ' + f'{layer_idx + 1}') # The final layernorm. 
print('Converting final layernorm') @@ -598,19 +611,19 @@ def convert_checkpoint_from_megatron_to_transformers(args): torch.save(shard, os.path.join(args.save_path, shard_file)) if index is None: - print( - f'Model weights saved in {os.path.join(args.save_path, WEIGHTS_NAME)}' - ) + print(f'Model weights saved in ' + f'{os.path.join(args.save_path, WEIGHTS_NAME)}') else: save_index_file = os.path.join(args.save_path, WEIGHTS_INDEX_NAME) # Save the index as well with open(save_index_file, 'w', encoding='utf-8') as f: content = json.dumps(index, indent=2, sort_keys=True) + '\n' f.write(content) - print( - f'The model is bigger than the maximum size per checkpoint ({args.max_shard_size}) and is going to be ' - f'split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the ' - f'index located at {save_index_file}.') + print(f'The model is bigger than the maximum size per checkpoint ' + f'({args.max_shard_size}) and is going to be split in ' + f'{len(shards)} checkpoint shards. You can find where each ' + f'parameters has been saved in the index located at ' + f'{save_index_file}.') def main(): diff --git a/tools/converter/modeling_megatron_llama.py b/tools/converter/modeling_megatron_llama.py index 268856001..b74179395 100644 --- a/tools/converter/modeling_megatron_llama.py +++ b/tools/converter/modeling_megatron_llama.py @@ -89,7 +89,7 @@ def __init__( ) -############################################################################################ +############################################################################### # Copied from transformers.models.bart.modeling_bart._make_causal_mask @@ -125,7 +125,7 @@ def _expand_mask(mask: torch.Tensor, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ + """ # noqa: E501 bsz, src_len = mask.size() tgt_len = tgt_len if tgt_len is not None else src_len @@ -169,8 +169,8 @@ def __init__(self, base=10000, device=None): super().__init__() - inv_freq = 1.0 / (base - **(torch.arange(0, dim, 2).float().to(device) / dim)) + inv_freq = 1.0 / (base**(torch.arange(0, dim, 2).float().to(device) / + dim)) # noqa: E225 self.register_buffer('inv_freq', inv_freq) # Build here to make `torch.jit.trace` work. @@ -179,7 +179,7 @@ def __init__(self, device=self.inv_freq.device, dtype=self.inv_freq.dtype) freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation + # Different from paper, but it uses a different permutation in order to obtain the same calculation # noqa: E501 emb = torch.cat((freqs, freqs), dim=-1) self.register_buffer('cos_cached', emb.cos()[None, None, :, :], @@ -190,14 +190,14 @@ def __init__(self, def forward(self, x, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. + # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. 
# noqa: E501 if seq_len > self.max_seq_len_cached: self.max_seq_len_cached = seq_len t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation + # Different from paper, but it uses a different permutation in order to obtain the same calculation # noqa: E501 emb = torch.cat((freqs, freqs), dim=-1).to(x.device) self.register_buffer('cos_cached', emb.cos()[None, None, :, :], @@ -219,7 +219,7 @@ def rotate_half(x): def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. # noqa: E501 cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] @@ -259,9 +259,9 @@ def __init__(self, config: MegatronLlamaConfig): self.max_position_embeddings = config.max_position_embeddings if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).') + raise ValueError(f'hidden_size must be divisible by num_heads ' + f'(got `hidden_size`: {self.hidden_size}' + f' and `num_heads`: {self.num_heads}).') self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.use_bias) @@ -321,14 +321,15 @@ def forward( if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( - f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is' + f'Attention weights should be of size ' + f'{(bsz, self.num_heads, q_len, kv_seq_len)}, but is' f' {attn_weights.size()}') if attention_mask is not None: if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}' - ) + raise ValueError(f'Attention mask should be of size ' + f'{(bsz, 1, q_len, kv_seq_len)}, but is ' + f'{attention_mask.size()}') attn_weights = attn_weights + attention_mask attn_weights = torch.max( attn_weights, @@ -343,7 +344,8 @@ def forward( if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): raise ValueError( - f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is' + f'`attn_output` should be of size ' + f'{(bsz, self.num_heads, q_len, self.head_dim)}, but is' f' {attn_output.size()}') attn_output = attn_output.transpose(1, 2) @@ -396,7 +398,7 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ + """ # noqa: E501 residual = hidden_states @@ -444,11 +446,12 @@ def forward( Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" +""" # noqa: E501 @add_start_docstrings( - 'The bare LLaMA Model outputting raw hidden-states without any specific head on top.', + 'The bare LLaMA Model outputting raw hidden-states without any specific ' + 'head on top.', LLAMA_START_DOCSTRING, ) class LlamaPreTrainedModel(PreTrainedModel): @@ -535,11 +538,12 @@ def _set_gradient_checkpointing(self, module, value=False): more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" +""" # noqa: E501 @add_start_docstrings( - 'The bare LLaMA Model outputting raw hidden-states without any specific head on top.', + 'The bare LLaMA Model outputting raw hidden-states without any specific ' + 'head on top.', LLAMA_START_DOCSTRING, ) class LlamaModel(LlamaPreTrainedModel): @@ -548,14 +552,15 @@ class LlamaModel(LlamaPreTrainedModel): Args: config: MegatronLlamaConfig - """ + """ # noqa: E501 def __init__(self, config: MegatronLlamaConfig): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size - # TODO: position embeddings, should be removed if rotary position embedding + # TODO: position embeddings, should be removed if rotary position + # embedding self.embed_position = nn.Embedding(config.max_sequence_length, config.hidden_size) @@ -577,7 +582,8 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embed_tokens = value - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + # Copied from + # transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): # create causal mask @@ -617,37 +623,40 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_attentions = output_attentions if output_attentions is not None\ + else self.config.output_attentions output_hidden_states = (output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states) - use_cache = use_cache if use_cache is not None else self.config.use_cache + use_cache = use_cache if use_cache is not None \ + else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None \ + else self.config.use_return_dict # retrieve input_ids and inputs_embeds if input_ids is not None and inputs_embeds is not None: - raise ValueError( - 'You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time' - ) + raise ValueError('You cannot specify both decoder_input_ids and ' + 'decoder_inputs_embeds at the same time') elif input_ids is not None: batch_size, seq_length = input_ids.shape elif inputs_embeds is not None: batch_size, seq_length, _ = inputs_embeds.shape else: - raise ValueError( - 'You have to specify either decoder_input_ids or decoder_inputs_embeds' - ) + raise ValueError('You have to specify either decoder_input_ids or ' + 'decoder_inputs_embeds') seq_length_with_past = seq_length past_key_values_length = 0 if past_key_values is not None: past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length + seq_length_with_past = seq_length_with_past + \ + 
past_key_values_length if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device + device = input_ids.device if input_ids is not None \ + else inputs_embeds.device position_ids = torch.arange(past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, @@ -677,8 +686,8 @@ def forward( if self.gradient_checkpointing and self.training: if use_cache: logger.warning_once( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' - ) + '`use_cache=True` is incompatible with gradient ' + 'checkpointing. Setting `use_cache=False`...') use_cache = False # decoder layers @@ -823,15 +832,18 @@ def forward( >>> generate_ids = model.generate(inputs.input_ids, max_length=30) >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." - ```""" + ```""" # noqa: E501 - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_attentions = output_attentions if output_attentions is not None\ + else self.config.output_attentions output_hidden_states = (output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = return_dict if return_dict is not None \ + else self.config.use_return_dict - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + # decoder outputs consists of + # (dec_features, layer_state, dec_hidden, dec_attn) outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, @@ -889,7 +901,8 @@ def prepare_inputs_for_generation(self, if past_key_values: position_ids = position_ids[:, -1].unsqueeze(-1) - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + # if `inputs_embeds` are passed, we only want to use them in the 1st + # generation step if inputs_embeds is not None and past_key_values is None: model_inputs = {'inputs_embeds': inputs_embeds} else: @@ -925,7 +938,7 @@ def _reorder_cache(past_key_values, beam_idx): no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in each row of the batch). - """, + """, # noqa: E501 LLAMA_START_DOCSTRING, ) class LlamaForSequenceClassification(LlamaPreTrainedModel): @@ -965,8 +978,9 @@ def forward( Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + """ # noqa: E501 + return_dict = return_dict if return_dict is not None \ + else self.config.use_return_dict transformer_outputs = self.model( input_ids, diff --git a/tools/evaluator/evaluator.py b/tools/evaluator/evaluator.py index c783fb4e1..2be62cbc6 100644 --- a/tools/evaluator/evaluator.py +++ b/tools/evaluator/evaluator.py @@ -5,7 +5,6 @@ import time import yaml - from gpt_eval.gpt_evaluator import GPTEvaluator from recorder.wandb_writer import HelmWriter @@ -25,15 +24,13 @@ def parse_args(): def check_args(args): - if args.begin_iteration == None: - print( - f'--begin-iteration is not provided, use the value of --iteration-interval ({args.iteration_interval}).' - ) + if args.begin_iteration is None: + print(f'--begin-iteration is not provided, use the value of ' + f'--iteration-interval ({args.iteration_interval}).') args.begin_iteration = args.iteration_interval - if args.end_iteration == None: - print( - f'--end-iteration is not provided, evaluator will monitor the traning process continuously.' - ) + if args.end_iteration is None: + print('--end-iteration is not provided, evaluator will monitor the ' + 'training process continuously.') args.end_iteration = float('inf') @@ -80,8 +77,9 @@ def load_config(self): self.tokenizer_path = None else: raise NotImplementedError( - f"tokenizer type: {self.config['megatron']['tokenizer_type']} is not supported" - ) + f'tokenizer type: ' + f"{self.config['megatron']['tokenizer_type']} is not " + f'supported') self.megatron_log_path = os.path.join(self.cache_dir, 'megatron.log') if 'log_path' in self.config['megatron']: @@ -157,9 +155,8 @@ def run_megatron_server(self, iteration): print(f'Wait for megatron checkpoint {iteration}') time.sleep(self.check_iterval * 60) # setup megatron server - print( - f'Start megatron text generation server for checkpoint iter_{iteration}' - ) + print(f'Start megatron text generation server for checkpoint ' + f'iter_{iteration}') args = [ 'torchrun', '--master_addr', '127.0.0.1', '--master_port', '5950', '--nproc_per_node', @@ -181,7 +178,7 @@ def run_megatron_server(self, iteration): def stop_megatron_server(self, process, logfile): process.terminate() logfile.close() - print(f'Stop megatron text generation server') + print('Stop megatron text generation server') def run_megatron_inference(self, iteration): while not self.megatron_checkpoint_exists(iteration): diff --git a/tools/evaluator/gpt_eval/answer_generator.py b/tools/evaluator/gpt_eval/answer_generator.py index c9b5cc233..000abef20 100644 --- a/tools/evaluator/gpt_eval/answer_generator.py +++ b/tools/evaluator/gpt_eval/answer_generator.py @@ -6,13 +6,12 @@ from abc import ABC, abstractmethod import jsonlines +import openai import requests import yaml from tqdm import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer -import openai - def parse_args(): parser = argparse.ArgumentParser() @@ -28,7 +27,7 @@ class AbstractGenerator(ABC): @abstractmethod def generate(self, texts, max_tokens, temperature): - raise NotImplementedError(f'GENERATE is not implemented') + raise NotImplementedError('GENERATE is not implemented') def close(self): # do nothing diff --git a/tools/evaluator/gpt_eval/gpt_evaluator.py b/tools/evaluator/gpt_eval/gpt_evaluator.py index e3d8c8adb..dd11e10a3 100644 --- a/tools/evaluator/gpt_eval/gpt_evaluator.py +++ b/tools/evaluator/gpt_eval/gpt_evaluator.py @@ -9,11 +9,10 @@ from multiprocessing import Pool import jsonlines +import 
openai import yaml from tqdm import tqdm -import openai - logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -96,9 +95,9 @@ def parse_score(self, review): if len(sp) == 2: return [float(sp[0]), float(sp[1])] else: - logger.error(f'Invalid score pair.') + logger.error('Invalid score pair.') return [0, 0] - except Exception as e: + except Exception: logger.error('Invalid answer') return [0, 0] diff --git a/tools/evaluator/recorder/wandb_writer.py b/tools/evaluator/recorder/wandb_writer.py index 95fa2744f..f05de8b9c 100644 --- a/tools/evaluator/recorder/wandb_writer.py +++ b/tools/evaluator/recorder/wandb_writer.py @@ -101,7 +101,8 @@ def __init__(self, self.leaderboard = leaderboard if self.leaderboard: self.leaderboard_metrics = self.conf[ - 'leaderboard_metrics'] if 'leaderboard_metrics' in self.conf else DEFAULT_HELM_METRICS + 'leaderboard_metrics'] if 'leaderboard_metrics' in self.conf \ + else DEFAULT_HELM_METRICS self.excluded_models = self.conf[ 'excluded_models'] if 'excluded_models' in self.conf else [] return @@ -324,7 +325,7 @@ def main(): else: raise NotImplementedError( f"Unsupported type for eval type {eval['eval_type']}") - if 'leaderboard' in config and config['leaderboard'] == True: + if 'leaderboard' in config and config['leaderboard'] is True: HelmWriter(project_name=config['project'], base_url=config['base_url'], leaderboard=True, diff --git a/tools/hpo/execute_hpo.py b/tools/hpo/execute_hpo.py index 7f7bb2d09..860827700 100644 --- a/tools/hpo/execute_hpo.py +++ b/tools/hpo/execute_hpo.py @@ -3,9 +3,9 @@ import wandb import yaml from jsonargparse import namespace_to_dict +from objects import get_hpo_objective from data_juicer.config import init_configs, merge_config -from objects import get_hpo_objective # 1: load the defined search space sweep_cfg_file_path = None diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py index 9df971e80..b0c1495df 100644 --- a/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py +++ b/tools/multimodal/data_juicer_format_to_target_format/dj_to_llava.py @@ -42,11 +42,11 @@ # }, # { # "from": "human", -# "value": "Is the bus driving down the street or pulled off to the side?" +# "value": "Is the bus driving down the street or pulled off to the side?" # noqa: E501 # }, # { # "from": "gpt", -# "value": "The bus is driving down the street, which is crowded with people and other vehicles." +# "value": "The bus is driving down the street, which is crowded with people and other vehicles." 
# noqa: E501 # } # ] # }, @@ -56,27 +56,28 @@ # Reference: # https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md +import json import os + import fire -import json import jsonlines as jl import regex as re - -from tqdm import tqdm from loguru import logger +from tqdm import tqdm from data_juicer.utils.mm_utils import SpecialTokens + @logger.catch def main( - dj_ds_path: str, - target_llava_ds_path: str, - keep_only_first_image: bool = True, - eoc_special_token: str = SpecialTokens.eoc, - image_special_token: str = '', - sent_seperator: str = '\n', - convert_to_relative_paths: bool = False, - original_llava_ds_path: str = None, + dj_ds_path: str, + target_llava_ds_path: str, + keep_only_first_image: bool = True, + eoc_special_token: str = SpecialTokens.eoc, + image_special_token: str = '', + sent_seperator: str = '\n', + convert_to_relative_paths: bool = False, + original_llava_ds_path: str = None, ): """ Convert a Data-Juicer-format dataset to a LLaVA-like dataset. @@ -119,7 +120,7 @@ def main( f'Input dataset [{dj_ds_path}] can not be found.') if not target_llava_ds_path.endswith('.json'): raise ValueError( - f'Only support "json" target dataset file for LLaVA now.') + 'Only support "json" target dataset file for LLaVA now.') if os.path.dirname(target_llava_ds_path) \ and not os.path.exists(os.path.dirname(target_llava_ds_path)): logger.info( @@ -129,19 +130,19 @@ def main( # check if the default image special token is changed if image_special_token != '': - logger.warning(f'The image_special_token used in the original LLaVA ' - f'dataset is "". It\'s better to align the this ' - f'token. There might be some compatibility problem if ' - f'you change it.') + logger.warning('The image_special_token used in the original LLaVA ' + 'dataset is "". It\'s better to align the this ' + 'token. There might be some compatibility problem if ' + 'you change it.') # if convert_to_relative_paths is True, check if the original_llava_ds_path # is provided as well. if convert_to_relative_paths: if not original_llava_ds_path: - raise ValueError(f'When convert_to_relative_paths is set to True, ' - f'the original_llava_ds_path must be provided ' - f'for recovering the relative paths. Please ' - f'check and retry.') + raise ValueError('When convert_to_relative_paths is set to True, ' + 'the original_llava_ds_path must be provided ' + 'for recovering the relative paths. Please ' + 'check and retry.') original_llava_ds_path = os.path.abspath(original_llava_ds_path) # if provided original_llava_ds_path is the dataset file path, only # keep the directory path. @@ -201,17 +202,11 @@ def main( if sent.endswith(sent_seperator): sent = sent[:-len(sent_seperator)].strip() - conversation = { - 'from': role, - 'value': sent - } + conversation = {'from': role, 'value': sent} conversations.append(conversation) # make up the new sample - new_sample = { - 'id': id, - 'conversations': conversations - } + new_sample = {'id': id, 'conversations': conversations} if len(images) == 1: image_path = images[0] if convert_to_relative_paths: diff --git a/tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py b/tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py index 4cff127e7..58007abf2 100644 --- a/tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py +++ b/tools/multimodal/source_format_to_data_juicer_format/llava_to_dj.py @@ -27,11 +27,11 @@ # }, # { # "from": "human", -# "value": "Is the bus driving down the street or pulled off to the side?" 
+# "value": "Is the bus driving down the street or pulled off to the side?" # noqa: E501 # }, # { # "from": "gpt", -# "value": "The bus is driving down the street, which is crowded with people and other vehicles." +# "value": "The bus is driving down the street, which is crowded with people and other vehicles." # noqa: E501 # } # ] # }, @@ -56,30 +56,30 @@ # Reference: # https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md +import json import os -import fire import random -import json -import jsonlines as jl -from tqdm import tqdm +import fire +import jsonlines as jl from loguru import logger +from tqdm import tqdm from data_juicer.utils.mm_utils import SpecialTokens @logger.catch def main( - llava_ds_path: str, - target_ds_path: str, - str_id: bool = True, - split_chunk: bool = False, - image_broadcast: bool = False, - image_broadcast_pos: str = 'random', - eoc_special_token: str = SpecialTokens.eoc, - image_special_token: str = '', - add_eoc_at_last: bool = True, - sent_seperator: str = '\n', + llava_ds_path: str, + target_ds_path: str, + str_id: bool = True, + split_chunk: bool = False, + image_broadcast: bool = False, + image_broadcast_pos: str = 'random', + eoc_special_token: str = SpecialTokens.eoc, + image_special_token: str = '', + add_eoc_at_last: bool = True, + sent_seperator: str = '\n', ): """ Convert a LLaVA-like dataset to the Data-Juicer format. @@ -122,7 +122,7 @@ def main( raise FileNotFoundError(f'Input LLaVA dataset [{llava_ds_path}] can ' f'not be found.') if not target_ds_path.endswith('.jsonl'): - raise ValueError(f'Only support "jsonl" target dataset file now.') + raise ValueError('Only support "jsonl" target dataset file now.') if os.path.dirname(target_ds_path) \ and not os.path.exists(os.path.dirname(target_ds_path)): logger.info(f'Create directory [{os.path.dirname(target_ds_path)}] ' @@ -139,18 +139,18 @@ def main( f'given [{image_broadcast_pos}]') # check if the default image special token is changed if image_special_token != '': - logger.warning(f'The image_special_token used in the original LLaVA ' - f'dataset is "". It\'s better to align the this ' - f'token. There might be some compatibility problem if ' - f'you change it.') + logger.warning('The image_special_token used in the original LLaVA ' + 'dataset is "". It\'s better to align the this ' + 'token. There might be some compatibility problem if ' + 'you change it.') # check whether to add the eoc special token at last if not add_eoc_at_last: - logger.warning(f'You choose not to add special eoc token at the last, ' - f'which might cause some compatibility problems for ' - f'other type of datasets (e.g. OpenFlamingo).') + logger.warning('You choose not to add special eoc token at the last, ' + 'which might cause some compatibility problems for ' + 'other type of datasets (e.g. OpenFlamingo).') # load LLaVA dataset - logger.info(f'Loading original LLaVA dataset.') + logger.info('Loading original LLaVA dataset.') llava_ds = json.load(open(llava_ds_path, 'r', encoding='utf-8')) logger.info(f'Load [{len(llava_ds)}] samples.') @@ -215,8 +215,7 @@ def main( f'[{id}] is neither before nor after the text. ' f'The position might be wrong or there is no ' f'image_special_token in this sample. Please ' - f'check and fix the dataset and retry.' 
- ) + f'check and fix the dataset and retry.') images.append(image) else: # whether broadcast image special token to following @@ -249,7 +248,7 @@ def main( # combine these texts together new_sent = role_human + sent_human + sent_seperator \ - + role_robot + sent_robot + + role_robot + sent_robot formatted_conversations.append(new_sent) join_sep = sent_seperator diff --git a/tools/preprocess/serialize_meta.py b/tools/preprocess/serialize_meta.py index 327132f4a..825c99746 100644 --- a/tools/preprocess/serialize_meta.py +++ b/tools/preprocess/serialize_meta.py @@ -23,8 +23,8 @@ def meta_serialize(file_name, target_file, text_key, serialized_key): :param file_name: path to source jsonl files. :param target_file: path to store the converted jsonl files. :text_key: the key corresponding to the field that will not be serialized. - :param serialized_key: the key corresponding to the field that the serialized - info saved. Default it's 'source_info'. + :param serialized_key: the key corresponding to the field that the + serialized info saved. Default it's 'source_info'. """ with open(target_file, 'w') as fw: target = {} @@ -49,8 +49,8 @@ def main(src_dir, :param target_dir: path to save the converted jsonl files. :param text_key: the key corresponding to the field that will not be serialized. Default it's 'text'. - :param serialized_key: the key corresponding to the field that the serialized - info saved. Default it's 'source_info'. + :param serialized_key: the key corresponding to the field that the + serialized info saved. Default it's 'source_info'. :param num_proc: number of process worke. Default it's 1. """ @@ -62,8 +62,8 @@ def main(src_dir, text_key = [text_key] for key in text_key: - assert key != serialized_key, "text_key '{}' cannot be the same as serialized_key.".format( - key) + assert key != serialized_key, "text_key '{}' cannot be the same as " \ + 'serialized_key.'.format(key) # check if the source directory exists if not os.path.exists(src_dir):