diff --git a/data_juicer/ops/mapper/extract_qa_mapper.py b/data_juicer/ops/mapper/extract_qa_mapper.py index 31767543f..db8a397f2 100644 --- a/data_juicer/ops/mapper/extract_qa_mapper.py +++ b/data_juicer/ops/mapper/extract_qa_mapper.py @@ -41,7 +41,7 @@ class ExtractQAMapper(Mapper): def __init__(self, hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', - trust_remote_code=False, + trust_remote_code: bool = False, pattern: str = None, qa_format: str = 'chatml', enable_vllm: bool = True, diff --git a/data_juicer/ops/mapper/generate_instruction_mapper.py b/data_juicer/ops/mapper/generate_instruction_mapper.py index 92269554d..f75c54153 100644 --- a/data_juicer/ops/mapper/generate_instruction_mapper.py +++ b/data_juicer/ops/mapper/generate_instruction_mapper.py @@ -51,9 +51,9 @@ class GenerateInstructionMapper(Mapper): _accelerator = 'cuda' def __init__(self, - hf_model, - seed_file, - instruct_num, + hf_model: str = 'Qwen/Qwen-7B-Chat', + seed_file: str = None, + instruct_num: int = 3, trust_remote_code: bool = False, similarity_threshold: float = 0.7, prompt_template: str = None, @@ -111,6 +111,10 @@ def __init__(self, super().__init__(*args, **kwargs) self.num_proc = 1 + if not seed_file: + raise ValueError('Please provide `seed_file` parameter, a file in chatml format. '\ + 'Reference data: data-juicer/demos/data/demo-dataset-chatml.jsonl ') + self.instruct_num = instruct_num self.similarity_threshold = similarity_threshold self.similarity_type = 'rouge_l'