Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
Qirui-jiao committed Sep 2, 2024
2 parents ad39967 + 8ea47b7 commit 6f79643
Show file tree
Hide file tree
Showing 25 changed files with 842 additions and 29 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ dist
.idea/
wandb/
__pycache__
.vscode/
30 changes: 29 additions & 1 deletion configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,29 @@ process:
- clean_copyright_mapper: # remove copyright comments.
- expand_macro_mapper: # expand macro definitions in Latex text.
- extract_qa_mapper: # mapper to extract question and answer pair from text.
hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa'
hf_model: 'alibaba-pai/pai-qwen1_5-7b-doc2qa' # model name on huggingface to extract question and answer pair.
pattern: null # regular expression pattern to search for within text.
qa_format: 'chatml' # Output format of question and answer pair.
enable_vllm: true # Whether to use vllm for inference acceleration.
tensor_parallel_size: null # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
max_model_len: null # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config.
max_num_seqs: 256 # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration.
sampling_params: {} # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
- fix_unicode_mapper: # fix unicode errors in text.
- generate_instruction_mapper: # generate new instruction text data.
hf_model: 'Qwen/Qwen-7B-Chat' # model name on huggingface to generate instruction.
seed_file: 'demos/data/demo-dataset-chatml.jsonl' # Seed file as instruction samples to generate new instructions, chatml format.
instruct_num: 3 # the number of generated samples.
similarity_threshold: 0.7 # the similarity score threshold between the generated samples and the seed samples.Range from 0 to 1. Samples with similarity score less than this threshold will be kept.
prompt_template: null # Prompt template for generate samples. Please make sure the template contains "{augmented_data}", which corresponds to the augmented samples.
qa_pair_template: null # Prompt template for generate question and answer pair description. Please make sure the template contains two "{}" to format question and answer. Default: '【问题】\n{}\n【回答】\n{}\n'.
example_template: null # Prompt template for generate examples. Please make sure the template contains "{qa_pairs}", which corresponds to the question and answer pair description generated by param `qa_pair_template`.
qa_extraction_pattern: null # Regular expression pattern for parsing question and answer from model response.
enable_vllm: true # Whether to use vllm for inference acceleration.
tensor_parallel_size: null # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
max_model_len: null # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config.
max_num_seqs: 256 # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration.
sampling_params: {} # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
- image_blur_mapper: # mapper to blur images.
p: 0.2 # probability of the image being blured
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
Expand Down Expand Up @@ -123,6 +144,13 @@ process:
delete_random_char: false # whether to open the augmentation method of deleting random characters from the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强"
swap_random_char: false # whether to open the augmentation method of swapping random contiguous characters in the original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法"
replace_equivalent_num: false # whether to open the augmentation method of replacing random numbers with their equivalent representations in the original texts. **Notice**: Only for numbers for now. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法"
- optimize_instruction_mapper: # optimize instruction.
hf_model: 'alibaba-pai/Qwen2-7B-Instruct-Refine' # model name on huggingface to optimize instruction
enable_vllm: true # whether to use vllm for inference acceleration.
tensor_parallel_size: null # It is only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.
max_model_len: null # It is only valid when enable_vllm is True. Model context length. If unspecified, will be automatically derived from the model config.
max_num_seqs: 256 # It is only valid when enable_vllm is True. Maximum number of sequences to be processed in a single iteration.
sampling_params: {} # Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95}
- punctuation_normalization_mapper: # normalize unicode punctuations to English punctuations.
- remove_bibliography_mapper: # remove bibliography from Latex text.
- remove_comments_mapper: # remove comments from Latex text, code, etc.
Expand Down
File renamed without changes.
9 changes: 8 additions & 1 deletion data_juicer/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,13 @@ def init_configs(args=None):
help='Path to datasets with optional weights(0.0-1.0), 1.0 as '
'default. Accepted format:<w1> dataset1-path <w2> dataset2-path '
'<w3> dataset3-path ...')
parser.add_argument(
'--generated_dataset_config',
type=Dict,
default=None,
help='Configuration used to create a dataset. '
'The dataset will be created from this configuration if provided. '
'It must contain the `type` field to specify the dataset name.')
parser.add_argument(
'--export_path',
type=str,
Expand Down Expand Up @@ -371,7 +378,7 @@ def init_setup_from_cfg(cfg):
redirect=cfg.executor_type == 'default')

# check and get dataset dir
if os.path.exists(cfg.dataset_path):
if cfg.get('dataset_path', None) and os.path.exists(cfg.dataset_path):
cfg.dataset_path = os.path.abspath(cfg.dataset_path)
if os.path.isdir(cfg.dataset_path):
cfg.dataset_dir = cfg.dataset_path
Expand Down
1 change: 1 addition & 0 deletions data_juicer/core/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def __init__(self, cfg=None):
# setup formatter
logger.info('Setting up data formatter...')
self.formatter = load_formatter(self.cfg.dataset_path,
self.cfg.generated_dataset_config,
self.cfg.text_keys, self.cfg.suffixes,
self.cfg.add_suffix)

Expand Down
12 changes: 11 additions & 1 deletion data_juicer/core/ray_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,17 @@ def run(self, load_data_np=None):
"""
# 1. load data
logger.info('Loading dataset with Ray...')
dataset = rd.read_json(self.cfg.dataset_path)

if self.cfg.get('generated_dataset_config', None):
generated_dataset_config = self.cfg.generated_dataset_config
assert isinstance(generated_dataset_config,
dict) and 'type' in generated_dataset_config
args = generated_dataset_config.copy()
obj_name = args.pop('type')
from data_juicer.format.formatter import FORMATTERS
dataset = FORMATTERS.modules[obj_name](**args).load_dataset()
else:
dataset = rd.read_json(self.cfg.dataset_path)

# convert all the path in dataset to absolute path
dataset = RayDataset(dataset, self.cfg.dataset_path, self.cfg)
Expand Down
8 changes: 5 additions & 3 deletions data_juicer/format/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from . import (csv_formatter, json_formatter, mixture_formatter,
parquet_formatter, text_formatter, tsv_formatter)
from . import (csv_formatter, empty_formatter, json_formatter,
mixture_formatter, parquet_formatter, text_formatter,
tsv_formatter)
from .csv_formatter import CsvFormatter
from .empty_formatter import EmptyFormatter, RayEmptyFormatter
from .formatter import LocalFormatter, RemoteFormatter
from .json_formatter import JsonFormatter
from .load import load_formatter
Expand All @@ -12,5 +14,5 @@
__all__ = [
'load_formatter', 'JsonFormatter', 'LocalFormatter', 'RemoteFormatter',
'TextFormatter', 'ParquetFormatter', 'CsvFormatter', 'TsvFormatter',
'MixtureFormatter'
'MixtureFormatter', 'EmptyFormatter', 'RayEmptyFormatter'
]
84 changes: 84 additions & 0 deletions data_juicer/format/empty_formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from typing import List

import pandas as pd
import ray
from datasets import Dataset, Features, Value

from .formatter import FORMATTERS, BaseFormatter


@FORMATTERS.register_module()
class EmptyFormatter(BaseFormatter):
"""
The class is used to create empty data.
"""
SUFFIXES = []

def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
"""
Initialization method.
:param length: The empty dataset length.
:param feature_keys: feature key name list.
"""
self.length = length
self.feature_keys = feature_keys
if isinstance(self.feature_keys, str):
self.feature_keys = [self.feature_keys]

@property
def null_value(self):
return None

def load_dataset(self, *args, **kwargs):
data_dict = {}
features = Features()

for key in self.feature_keys:
features.update({key: Value('string')})
data_dict.update(
{key: [self.null_value for _ in range(self.length)]})

empty_dataset = Dataset.from_dict(data_dict, features=features)

from data_juicer.core.data import NestedDataset
empty_dataset = NestedDataset(empty_dataset)

return empty_dataset


@FORMATTERS.register_module()
class RayEmptyFormatter(BaseFormatter):
"""
The class is used to create empty data for ray.
"""
SUFFIXES = []

def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs):
"""
Initialization method.
:param length: The empty dataset length.
:param feature_keys: feature key name list.
"""
self.length = length
self.feature_keys = feature_keys
if isinstance(self.feature_keys, str):
self.feature_keys = [self.feature_keys]

@property
def null_value(self):
return {}

def load_dataset(self, *args, **kwargs):
if len(self.feature_keys):
df = pd.DataFrame({
col: [self.null_value for _ in range(self.length)]
for col in self.feature_keys
})
else:
df = pd.DataFrame([self.null_value for _ in range(self.length)])

empty_dataset = ray.data.from_pandas(df)

return empty_dataset
14 changes: 14 additions & 0 deletions data_juicer/format/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@


def load_formatter(dataset_path,
generated_dataset_config=None,
text_keys=None,
suffixes=[],
add_suffix=False,
Expand All @@ -12,13 +13,26 @@ def load_formatter(dataset_path,
weight(default 1.0) according to their formats.
:param dataset_path: path to a dataset file or a dataset directory
:param generated_dataset_config: Configuration used to create a dataset.
The dataset will be created from this configuration if provided.
It must contain the `type` field to specify the dataset name.
:param text_keys: key names of field that stores sample text.
Default: None
:param suffixes: files with specified suffixes to be processed.
:param add_suffix: whether to add the file suffix to dataset meta
info
:return: a dataset formatter.
"""
if generated_dataset_config:
assert isinstance(generated_dataset_config,
dict) and 'type' in generated_dataset_config
args = generated_dataset_config.copy()
obj_name = args.pop('type')
args.update(kwargs)

from .formatter import FORMATTERS
return FORMATTERS.modules[obj_name](**args)

formatter = MixtureFormatter(dataset_path=dataset_path,
text_keys=text_keys,
suffixes=suffixes,
Expand Down
11 changes: 8 additions & 3 deletions data_juicer/ops/filter/text_pair_similarity_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,22 @@ def __init__(self,
self.model_key = prepare_model(model_type='huggingface',
pretrained_model_name_or_path=hf_clip,
trust_remote_code=trust_remote_code)
self.new_sample_key = ['target_text']

def compute_stats(self, sample, rank=None, context=False):

# check if it's computed already
if StatsKeys.text_pair_similarity in sample[Fields.stats]:
return sample

# there is no target text
for temp_new_key in self.new_sample_key:
if temp_new_key not in sample or len(sample[temp_new_key]) == 0:
raise ValueError(
f'Key \'{temp_new_key}\' is not found in sample. ')

# there is no text in this sample
if (self.text_key not in sample or 'target_text' not in sample
or len(sample[self.text_key]) == 0
or len(sample['target_text']) == 0):
if (self.text_key not in sample or len(sample[self.text_key]) == 0):
sample[Fields.stats][StatsKeys.text_pair_similarity] = np.array(
[], dtype=np.float64)
return sample
Expand Down
9 changes: 7 additions & 2 deletions data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
from . import (audio_ffmpeg_wrapped_mapper, chinese_convert_mapper,
clean_copyright_mapper, clean_email_mapper, clean_html_mapper,
clean_ip_mapper, clean_links_mapper, expand_macro_mapper,
extract_qa_mapper, fix_unicode_mapper, image_blur_mapper,
extract_qa_mapper, fix_unicode_mapper,
generate_instruction_mapper, image_blur_mapper,
image_captioning_from_gpt4v_mapper, image_captioning_mapper,
image_diffusion_mapper, image_face_blur_mapper,
nlpaug_en_mapper, nlpcda_zh_mapper,
nlpaug_en_mapper, nlpcda_zh_mapper, optimize_instruction_mapper,
punctuation_normalization_mapper, remove_bibliography_mapper,
remove_comments_mapper, remove_header_mapper,
remove_long_words_mapper, remove_non_chinese_character_mapper,
Expand Down Expand Up @@ -34,13 +35,15 @@
from .expand_macro_mapper import ExpandMacroMapper
from .extract_qa_mapper import ExtractQAMapper
from .fix_unicode_mapper import FixUnicodeMapper
from .generate_instruction_mapper import GenerateInstructionMapper
from .image_blur_mapper import ImageBlurMapper
from .image_captioning_from_gpt4v_mapper import ImageCaptioningFromGPT4VMapper
from .image_captioning_mapper import ImageCaptioningMapper
from .image_diffusion_mapper import ImageDiffusionMapper
from .image_face_blur_mapper import ImageFaceBlurMapper
from .nlpaug_en_mapper import NlpaugEnMapper
from .nlpcda_zh_mapper import NlpcdaZhMapper
from .optimize_instruction_mapper import OptimizeInstructionMapper
from .punctuation_normalization_mapper import PunctuationNormalizationMapper
from .remove_bibliography_mapper import RemoveBibliographyMapper
from .remove_comments_mapper import RemoveCommentsMapper
Expand Down Expand Up @@ -92,6 +95,7 @@
'VideoFFmpegWrappedMapper',
'ChineseConvertMapper',
'NlpcdaZhMapper',
'OptimizeInstructionMapper',
'ImageBlurMapper',
'CleanCopyrightMapper',
'RemoveNonChineseCharacterlMapper',
Expand All @@ -108,6 +112,7 @@
'RemoveWordsWithIncorrectSubstringsMapper',
'VideoCaptioningFromVideoMapper',
'VideoCaptioningFromSummarizerMapper',
'GenerateInstructionMapper',
'FixUnicodeMapper',
'NlpaugEnMapper',
'VideoCaptioningFromFramesMapper',
Expand Down
Loading

0 comments on commit 6f79643

Please sign in to comment.