From 09880a38c4cf98c834344954b99926da9949e1d1 Mon Sep 17 00:00:00 2001 From: "jiangnana.jnn" Date: Tue, 10 Sep 2024 14:22:59 +0800 Subject: [PATCH 1/2] fix param def --- data_juicer/ops/mapper/extract_qa_mapper.py | 2 +- data_juicer/ops/mapper/generate_instruction_mapper.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data_juicer/ops/mapper/extract_qa_mapper.py b/data_juicer/ops/mapper/extract_qa_mapper.py index 31767543f..db8a397f2 100644 --- a/data_juicer/ops/mapper/extract_qa_mapper.py +++ b/data_juicer/ops/mapper/extract_qa_mapper.py @@ -41,7 +41,7 @@ class ExtractQAMapper(Mapper): def __init__(self, hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', - trust_remote_code=False, + trust_remote_code: bool = False, pattern: str = None, qa_format: str = 'chatml', enable_vllm: bool = True, diff --git a/data_juicer/ops/mapper/generate_instruction_mapper.py b/data_juicer/ops/mapper/generate_instruction_mapper.py index 92269554d..e42f697d5 100644 --- a/data_juicer/ops/mapper/generate_instruction_mapper.py +++ b/data_juicer/ops/mapper/generate_instruction_mapper.py @@ -51,9 +51,9 @@ class GenerateInstructionMapper(Mapper): _accelerator = 'cuda' def __init__(self, - hf_model, - seed_file, - instruct_num, + hf_model: str = 'Qwen/Qwen-7B-Chat', + seed_file: str = None, + instruct_num: int = 3, trust_remote_code: bool = False, similarity_threshold: float = 0.7, prompt_template: str = None, From 1db44c73c90c0c6e72fd18fa9e6a30f83e28deba Mon Sep 17 00:00:00 2001 From: "jiangnana.jnn" Date: Wed, 11 Sep 2024 14:20:50 +0800 Subject: [PATCH 2/2] add param check --- data_juicer/ops/mapper/generate_instruction_mapper.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/data_juicer/ops/mapper/generate_instruction_mapper.py b/data_juicer/ops/mapper/generate_instruction_mapper.py index e42f697d5..f75c54153 100644 --- a/data_juicer/ops/mapper/generate_instruction_mapper.py +++ b/data_juicer/ops/mapper/generate_instruction_mapper.py @@ -111,6 +111,10 @@ def __init__(self, super().__init__(*args, **kwargs) self.num_proc = 1 + if not seed_file: + raise ValueError('Please provide `seed_file` parameter, a file in chatml format. '\ + 'Reference data: data-juicer/demos/data/demo-dataset-chatml.jsonl ') + self.instruct_num = instruct_num self.similarity_threshold = similarity_threshold self.similarity_type = 'rouge_l'