From a64ffa593c04328f57378dff0652f33f11717650 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 19 Jun 2024 14:52:24 -0400 Subject: [PATCH 1/9] First pass chat template (backend only) Signed-off-by: Mustafa Eyceoz --- .../chat_templates/ibm_generic_tmpl.py | 23 +++++++++++ .../training/chat_templates/mistral_tmpl.py | 23 +++++++++++ src/instructlab/training/data_process.py | 14 +------ src/instructlab/training/tokenizer_utils.py | 40 ++++++++----------- 4 files changed, 64 insertions(+), 36 deletions(-) create mode 100644 src/instructlab/training/chat_templates/ibm_generic_tmpl.py create mode 100644 src/instructlab/training/chat_templates/mistral_tmpl.py diff --git a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py new file mode 100644 index 00000000..ba54e68d --- /dev/null +++ b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py @@ -0,0 +1,23 @@ +from tokenizer_utils import SpecialTokens + +SPECIAL_TOKENS = SpecialTokens( + system="<|system|>", + user="<|user|>", + assistant="<|assistant|>", + eos="<|endoftext|>", + pad="<|pad|>" +) + +CHAT_TEMPLATE = ( + "{% for message in messages %}" + "{% if message['role'] == 'pretraining' %}" + "{{'<|endoftext|>' + message['content'] + '<|endoftext|>'}}" + "{% elif message['role'] == 'system' %}" + "{{'<|system|>'+ '\n' + message['content'] + '\n'}}" + "{% elif message['role'] == 'user' %}" + "{{'<|user|>' + '\n' + message['content'] + '\n'}}" + "{% elif message['role'] == 'assistant' %}" + "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}" + "{% endif %}" + "{% endfor %}" +) \ No newline at end of file diff --git a/src/instructlab/training/chat_templates/mistral_tmpl.py b/src/instructlab/training/chat_templates/mistral_tmpl.py new file mode 100644 index 00000000..ba85e93d --- /dev/null +++ b/src/instructlab/training/chat_templates/mistral_tmpl.py @@ -0,0 +1,23 @@ +from tokenizer_utils import SpecialTokens + +SPECIAL_TOKENS = SpecialTokens( + bos="", + eos="", + user="[INST]", + assistant="[/INST]", + + +) + +CHAT_TEMPLATE = ( + "{{ '' }}" + "{% for message in messages %}" + "{% if message['role'] == 'pretraining' %}" + "{{ message['content'] + '' }}" + "{% elif message['role'] == 'user' %}" + "{{ '[INST] ' + message['content'] + ' [/INST]' }}" + "{% elif message['role'] == 'assistant' %}" + "{{ message['content'] + ''}}" + "{% endif %}" + "{% endfor %}" +) diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py index f8f43535..9ea51a38 100644 --- a/src/instructlab/training/data_process.py +++ b/src/instructlab/training/data_process.py @@ -37,18 +37,6 @@ def check_valid_sample( if not any(token in whole_sentence_tk for token in special_tokens): return True - # first token should be system_token - if whole_sentence_tk[0] != system_tk: - print("\033[91mfirst token is not a system_token\033[0m") - log_rank_0(tokenizer.decode(whole_sentence_tk), to_print=True) - return False - - # check there's only one system_token - if (np.array(whole_sentence_tk) == system_tk).sum() != 1: - print("\033[91mthere are more than one system_token\033[0m") - log_rank_0(tokenizer.decode(whole_sentence_tk), to_print=True) - return False - whole_sentence_tk = np.array(whole_sentence_tk) user_token_index = (whole_sentence_tk == user_tk).nonzero()[0] assistant_token_index = (whole_sentence_tk == assistant_tk).nonzero()[0] @@ -121,7 +109,7 @@ def unmask_only_assistant_responses( whole_sentence = chosen_token["input_ids"][:sentence_legth].clone() # pre-training mode - if system_tk not in whole_sentence: + if not (system_tk in whole_sentence or user_token in whole_sentence or assist_token in whole_sentence): return labels labels[:sentence_legth] = -100 diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py index 7eff0e69..c7e195ac 100644 --- a/src/instructlab/training/tokenizer_utils.py +++ b/src/instructlab/training/tokenizer_utils.py @@ -10,44 +10,38 @@ @dataclass class SpecialTokens: - system: str = field(default="<|system|>") + system: str = field(default=None) user: str = field(default="<|user|>") assistant: str = field(default="<|assistant|>") eos: str = field(default="<|endoftext|>") - pad: str = field(default="<|pad|>") + pad: str = field(default=None) + bos: str = field(default="<|begginingoftext|>") -SPECIAL_TOKENS = SpecialTokens() - -CHAT_TEMPLATE = ( - "{% for message in messages %}" - "{% if message['role'] == 'pretraining' %}" - "{{'<|endoftext|>' + message['content'] + '<|endoftext|>'}}" - "{% elif message['role'] == 'system' %}" - "{{'<|system|>'+ '\n' + message['content'] + '\n'}}" - "{% elif message['role'] == 'user' %}" - "{{'<|user|>' + '\n' + message['content'] + '\n'}}" - "{% elif message['role'] == 'assistant' %}" - "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}" - "{% endif %}" - "{% endfor %}" -) +#TODO: Replace with specified template path +from instructlab.training.chat_templates.ibm_generic_tmpl import SPECIAL_TOKENS, CHAT_TEMPLATE def setup_tokenizer( model_name_or_path, SPECIAL_TOKENS=SPECIAL_TOKENS, CHAT_TEMPLATE=CHAT_TEMPLATE ): tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, fast_tokenizer=True) + + if not SPECIAL_TOKENS.pad: + SPECIAL_TOKENS.pad = SPECIAL_TOKENS.eos tokenizer.add_special_tokens( - {"eos_token": SPECIAL_TOKENS.eos, "pad_token": SPECIAL_TOKENS.pad} + {"bos_token": SPECIAL_TOKENS.bos, "eos_token": SPECIAL_TOKENS.eos, "pad_token": SPECIAL_TOKENS.pad} ) + + if SPECIAL_TOKENS.system: + add_token_list = [SPECIAL_TOKENS.system] + else: + add_token_list = [] + add_token_list.extend([SPECIAL_TOKENS.user, SPECIAL_TOKENS.assistant]) + tokenizer.add_special_tokens( { - "additional_special_tokens": [ - SPECIAL_TOKENS.system, - SPECIAL_TOKENS.user, - SPECIAL_TOKENS.assistant, - ] + "additional_special_tokens": add_token_list } ) if getattr(tokenizer, "add_bos_token", False) or getattr( From fa973a0f84e46256f5f96b5d833c8c6535311506 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 19 Jun 2024 15:32:13 -0400 Subject: [PATCH 2/9] Module path importing Signed-off-by: Mustafa Eyceoz --- src/instructlab/training/tokenizer_utils.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py index c7e195ac..de6d1031 100644 --- a/src/instructlab/training/tokenizer_utils.py +++ b/src/instructlab/training/tokenizer_utils.py @@ -19,8 +19,15 @@ class SpecialTokens: #TODO: Replace with specified template path -from instructlab.training.chat_templates.ibm_generic_tmpl import SPECIAL_TOKENS, CHAT_TEMPLATE - +#from instructlab.training.chat_templates.ibm_generic_tmpl import SPECIAL_TOKENS, CHAT_TEMPLATE +import importlib.util +import sys +spec = importlib.util.spec_from_file_location("ibm_generic_tmpl", "chat_templates/ibm_generic_tmpl.py") +module = importlib.util.module_from_spec(spec) +sys.modules["ibm_generic_tmpl"] = module +spec.loader.exec_module(module) +SPECIAL_TOKENS = module.SPECIAL_TOKENS +CHAT_TEMPLATE = module.CHAT_TEMPLATE def setup_tokenizer( model_name_or_path, SPECIAL_TOKENS=SPECIAL_TOKENS, CHAT_TEMPLATE=CHAT_TEMPLATE From 0a699546ee88e8db1d25f17b112a1f0e4cd7a776 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 20 Jun 2024 11:30:38 -0400 Subject: [PATCH 3/9] Fix missing sys handling Signed-off-by: Mustafa Eyceoz --- src/instructlab/training/data_process.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py index 9ea51a38..2cac9c73 100644 --- a/src/instructlab/training/data_process.py +++ b/src/instructlab/training/data_process.py @@ -196,7 +196,10 @@ def main(args: DataProcessArgs): eos_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.eos) pad_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.pad) - system_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.system) + if SPECIAL_TOKENS.system: + system_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.system) + else: + system_tk = None user_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.user) assistant_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.assistant) log_rank_0( From b1b11ebe136303a6f367f2ecef718482719e4ebd Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 20 Jun 2024 16:10:45 -0400 Subject: [PATCH 4/9] Connecting chat templates to the interface Signed-off-by: Mustafa Eyceoz --- .../chat_templates/ibm_generic_tmpl.py | 5 ++-- .../training/chat_templates/mistral_tmpl.py | 3 +-- src/instructlab/training/config.py | 4 +++ src/instructlab/training/data_process.py | 21 ++++++++++----- src/instructlab/training/main_ds.py | 11 +++++++- src/instructlab/training/tokenizer_utils.py | 27 +++++-------------- src/instructlab/training/utils.py | 14 ++++++++++ 7 files changed, 53 insertions(+), 32 deletions(-) diff --git a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py index ba54e68d..6d4b37d7 100644 --- a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py +++ b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py @@ -1,3 +1,4 @@ +# Third Party from tokenizer_utils import SpecialTokens SPECIAL_TOKENS = SpecialTokens( @@ -5,7 +6,7 @@ user="<|user|>", assistant="<|assistant|>", eos="<|endoftext|>", - pad="<|pad|>" + pad="<|pad|>", ) CHAT_TEMPLATE = ( @@ -20,4 +21,4 @@ "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}" "{% endif %}" "{% endfor %}" -) \ No newline at end of file +) diff --git a/src/instructlab/training/chat_templates/mistral_tmpl.py b/src/instructlab/training/chat_templates/mistral_tmpl.py index ba85e93d..753d5559 100644 --- a/src/instructlab/training/chat_templates/mistral_tmpl.py +++ b/src/instructlab/training/chat_templates/mistral_tmpl.py @@ -1,3 +1,4 @@ +# Third Party from tokenizer_utils import SpecialTokens SPECIAL_TOKENS = SpecialTokens( @@ -5,8 +6,6 @@ eos="", user="[INST]", assistant="[/INST]", - - ) CHAT_TEMPLATE = ( diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py index c8733a45..8b93485d 100644 --- a/src/instructlab/training/config.py +++ b/src/instructlab/training/config.py @@ -42,6 +42,7 @@ class DataProcessArgs(BaseModel): data_output_path: str max_seq_len: int # defines the max sequence length of a sample model_path: str # either a HF model name or path to HF model + chat_tmpl_path: str # disable the protected namespace for the model_config field model_config = ConfigDict(protected_namespaces=()) @@ -100,6 +101,9 @@ class TrainingArgs(BaseModel): # Either the name of a HuggingFace model or a path to a model saved in HuggingFace format. model_path: str + # Specify the chat template / special tokens for training (default is ibm-generic template/tokens) + chat_tmpl_path: str = __file__ + "/chat_templates/ibm_generic_tmpl.py" + # this field specifies the filepath to the training dataset before processing data_path: str ckpt_output_dir: str diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py index 2cac9c73..8d70ff34 100644 --- a/src/instructlab/training/data_process.py +++ b/src/instructlab/training/data_process.py @@ -10,12 +10,8 @@ # First Party from instructlab.training.config import DataProcessArgs -from instructlab.training.tokenizer_utils import ( - SPECIAL_TOKENS, - get_sp_token, - setup_tokenizer, -) -from instructlab.training.utils import log_rank_0, setup_logger +from instructlab.training.tokenizer_utils import get_sp_token, setup_tokenizer +from instructlab.training.utils import log_rank_0, retrieve_chat_template, setup_logger def check_valid_sample( @@ -109,7 +105,11 @@ def unmask_only_assistant_responses( whole_sentence = chosen_token["input_ids"][:sentence_legth].clone() # pre-training mode - if not (system_tk in whole_sentence or user_token in whole_sentence or assist_token in whole_sentence): + if not ( + system_tk in whole_sentence + or user_token in whole_sentence + or assist_token in whole_sentence + ): return labels labels[:sentence_legth] = -100 @@ -192,6 +192,7 @@ def remove_pretrain_system_messages(example: dict): def main(args: DataProcessArgs): + CHAT_TEMPLATE, SPECIAL_TOKENS = retrieve_chat_template(args.chat_tmpl_path) tokenizer = setup_tokenizer(args.model_path) eos_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.eos) @@ -300,6 +301,12 @@ def main(args: DataProcessArgs): parser.add_argument( "--model_name_or_path", type=str, required=True, help="Model name or path" ) + parser.add_argument( + "--chat-tmpl-path", + type=str, + default=f"{__file__}/chat_templates/ibm_generic_tmpl.py", + help="Path to desired chat template and special tokens, defaults to IBM generic.", + ) args = parser.parse_args() setup_logger(args.logging_level) data_process_args = DataProcessArgs( diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index eeb0c077..08702549 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -37,6 +37,7 @@ patch_target_module, prepare_peft_model, prepare_universal_checkpoint_from_latest, + retrieve_chat_template, save_hf_format_ds, save_model_ds_native, set_random_seed, @@ -438,7 +439,8 @@ def main(args): print(f"\033[38;5;120m{yaml.dump(vars(args), sort_keys=False)}\033[0m") setup_logger(args.log_level) - tokenizer = setup_tokenizer(args.model_name_or_path) + CHAT_TEMPLATE, SPECIAL_TOKENS = retrieve_chat_template(args.chat_tmpl_path) + tokenizer = setup_tokenizer(args.model_name_or_path, CHAT_TEMPLATE, SPECIAL_TOKENS) # device = torch.device("cuda", args.local_rank) #### distributed init ##### @@ -522,6 +524,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs): model_path=train_args.model_path, data_path=train_args.data_path, max_seq_len=train_args.max_seq_len, + chat_tmpl_path=train_args.chat_tmpl_path, ) ) @@ -546,6 +549,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs): f"--log_level=INFO", f"--max_batch_len={train_args.max_batch_len}", f"--seed={train_args.random_seed}", + f"--chat-tmpl-path={train_args.chat_tmpl_path}", ] if train_args.mock_data: @@ -644,6 +648,11 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs): help="Offload optimizer to CPU when using DeepSpeed. This configures it to use ZeRO stage 2.", ) parser.add_argument("--NEFTune_alpha", type=float, default=None) + parser.add_argument( + "--chat-tmpl-path", + type=str, + default=f"{__file__}/chat_templates/ibm_generic_tmpl.py", + ) args = parser.parse_args() set_random_seed(args.seed) main(args) diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py index de6d1031..5c789441 100644 --- a/src/instructlab/training/tokenizer_utils.py +++ b/src/instructlab/training/tokenizer_utils.py @@ -18,26 +18,17 @@ class SpecialTokens: bos: str = field(default="<|begginingoftext|>") -#TODO: Replace with specified template path -#from instructlab.training.chat_templates.ibm_generic_tmpl import SPECIAL_TOKENS, CHAT_TEMPLATE -import importlib.util -import sys -spec = importlib.util.spec_from_file_location("ibm_generic_tmpl", "chat_templates/ibm_generic_tmpl.py") -module = importlib.util.module_from_spec(spec) -sys.modules["ibm_generic_tmpl"] = module -spec.loader.exec_module(module) -SPECIAL_TOKENS = module.SPECIAL_TOKENS -CHAT_TEMPLATE = module.CHAT_TEMPLATE - -def setup_tokenizer( - model_name_or_path, SPECIAL_TOKENS=SPECIAL_TOKENS, CHAT_TEMPLATE=CHAT_TEMPLATE -): +def setup_tokenizer(model_name_or_path, SPECIAL_TOKENS, CHAT_TEMPLATE): tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, fast_tokenizer=True) if not SPECIAL_TOKENS.pad: SPECIAL_TOKENS.pad = SPECIAL_TOKENS.eos tokenizer.add_special_tokens( - {"bos_token": SPECIAL_TOKENS.bos, "eos_token": SPECIAL_TOKENS.eos, "pad_token": SPECIAL_TOKENS.pad} + { + "bos_token": SPECIAL_TOKENS.bos, + "eos_token": SPECIAL_TOKENS.eos, + "pad_token": SPECIAL_TOKENS.pad, + } ) if SPECIAL_TOKENS.system: @@ -46,11 +37,7 @@ def setup_tokenizer( add_token_list = [] add_token_list.extend([SPECIAL_TOKENS.user, SPECIAL_TOKENS.assistant]) - tokenizer.add_special_tokens( - { - "additional_special_tokens": add_token_list - } - ) + tokenizer.add_special_tokens({"additional_special_tokens": add_token_list}) if getattr(tokenizer, "add_bos_token", False) or getattr( tokenizer, "add_eos_token", False ): diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py index 6feaa548..ce935c41 100644 --- a/src/instructlab/training/utils.py +++ b/src/instructlab/training/utils.py @@ -24,6 +24,20 @@ import torch.nn.functional as F +def retrieve_chat_template(chat_tmpl_path): + # Standard + import importlib.util + import sys + + spec = importlib.util.spec_from_file_location("spcl_chat_tmpl", chat_tmpl_path) + module = importlib.util.module_from_spec(spec) + sys.modules["spcl_chat_tmpl"] = module + spec.loader.exec_module(module) + SPECIAL_TOKENS = module.SPECIAL_TOKENS + CHAT_TEMPLATE = module.CHAT_TEMPLATE + return CHAT_TEMPLATE, SPECIAL_TOKENS + + def add_noisy_embeddings(model, noise_alpha=None): if not noise_alpha: return model From fbb6af6cd3053fcda8ae95c1f42e6103ce5b397e Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Fri, 21 Jun 2024 11:12:16 -0400 Subject: [PATCH 5/9] Fix file-paths Signed-off-by: Mustafa Eyceoz --- src/instructlab/training/config.py | 5 ++++- src/instructlab/training/data_process.py | 4 +++- src/instructlab/training/main_ds.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py index 8b93485d..b989b18d 100644 --- a/src/instructlab/training/config.py +++ b/src/instructlab/training/config.py @@ -4,6 +4,7 @@ # Standard from enum import Enum +import os # Third Party from pydantic import BaseModel, ConfigDict, Field @@ -102,7 +103,9 @@ class TrainingArgs(BaseModel): model_path: str # Specify the chat template / special tokens for training (default is ibm-generic template/tokens) - chat_tmpl_path: str = __file__ + "/chat_templates/ibm_generic_tmpl.py" + chat_tmpl_path: str = ( + os.path.dirname(__file__) + "/chat_templates/ibm_generic_tmpl.py" + ) # this field specifies the filepath to the training dataset before processing data_path: str diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py index 8d70ff34..d6c245f1 100644 --- a/src/instructlab/training/data_process.py +++ b/src/instructlab/training/data_process.py @@ -2,6 +2,7 @@ from pathlib import Path from typing import List import logging +import os # Third Party from datasets import load_dataset @@ -304,7 +305,7 @@ def main(args: DataProcessArgs): parser.add_argument( "--chat-tmpl-path", type=str, - default=f"{__file__}/chat_templates/ibm_generic_tmpl.py", + default=f"{os.path.dirname(__file__)}/../chat_templates/ibm_generic_tmpl.py", help="Path to desired chat template and special tokens, defaults to IBM generic.", ) args = parser.parse_args() @@ -314,6 +315,7 @@ def main(args: DataProcessArgs): data_path=args.data_path, max_seq_len=args.max_seq_len, model_path=args.model_name_or_path, + chat_tmpl_path=args.chat_tmpl_path, ) main(data_process_args) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 08702549..1dedf513 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -651,7 +651,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs): parser.add_argument( "--chat-tmpl-path", type=str, - default=f"{__file__}/chat_templates/ibm_generic_tmpl.py", + default=f"{os.path.dirname(__file__)}/chat_templates/ibm_generic_tmpl.py", ) args = parser.parse_args() set_random_seed(args.seed) From 07b6662d76b44877692fd9d8e73463401a005f4f Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Fri, 21 Jun 2024 11:18:46 -0400 Subject: [PATCH 6/9] Typo fixes Signed-off-by: Mustafa Eyceoz --- src/instructlab/training/data_process.py | 4 ++-- src/instructlab/training/main_ds.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py index d6c245f1..9bbe0a5f 100644 --- a/src/instructlab/training/data_process.py +++ b/src/instructlab/training/data_process.py @@ -194,7 +194,7 @@ def remove_pretrain_system_messages(example: dict): def main(args: DataProcessArgs): CHAT_TEMPLATE, SPECIAL_TOKENS = retrieve_chat_template(args.chat_tmpl_path) - tokenizer = setup_tokenizer(args.model_path) + tokenizer = setup_tokenizer(args.model_path, SPECIAL_TOKENS, CHAT_TEMPLATE) eos_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.eos) pad_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.pad) @@ -305,7 +305,7 @@ def main(args: DataProcessArgs): parser.add_argument( "--chat-tmpl-path", type=str, - default=f"{os.path.dirname(__file__)}/../chat_templates/ibm_generic_tmpl.py", + default=f"{os.path.dirname(__file__)}/chat_templates/ibm_generic_tmpl.py", help="Path to desired chat template and special tokens, defaults to IBM generic.", ) args = parser.parse_args() diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 1dedf513..451413fc 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -440,7 +440,7 @@ def main(args): setup_logger(args.log_level) CHAT_TEMPLATE, SPECIAL_TOKENS = retrieve_chat_template(args.chat_tmpl_path) - tokenizer = setup_tokenizer(args.model_name_or_path, CHAT_TEMPLATE, SPECIAL_TOKENS) + tokenizer = setup_tokenizer(args.model_name_or_path, SPECIAL_TOKENS, CHAT_TEMPLATE) # device = torch.device("cuda", args.local_rank) #### distributed init ##### From 0db67f64595bb4e08bbaf8af1f3887088a957775 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Fri, 21 Jun 2024 11:55:13 -0400 Subject: [PATCH 7/9] Fixed template imports Signed-off-by: Mustafa Eyceoz --- .../training/chat_templates/ibm_generic_tmpl.py | 4 ++-- .../training/chat_templates/mistral_tmpl.py | 4 ++-- src/instructlab/training/utils.py | 15 +++++++++------ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py index 6d4b37d7..87bfdb0a 100644 --- a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py +++ b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py @@ -1,5 +1,5 @@ -# Third Party -from tokenizer_utils import SpecialTokens +# First Party +from instructlab.training.tokenizer_utils import SpecialTokens SPECIAL_TOKENS = SpecialTokens( system="<|system|>", diff --git a/src/instructlab/training/chat_templates/mistral_tmpl.py b/src/instructlab/training/chat_templates/mistral_tmpl.py index 753d5559..965823f2 100644 --- a/src/instructlab/training/chat_templates/mistral_tmpl.py +++ b/src/instructlab/training/chat_templates/mistral_tmpl.py @@ -1,5 +1,5 @@ -# Third Party -from tokenizer_utils import SpecialTokens +# First Party +from instructlab.training.tokenizer_utils import SpecialTokens SPECIAL_TOKENS = SpecialTokens( bos="", diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py index ce935c41..9e57dc68 100644 --- a/src/instructlab/training/utils.py +++ b/src/instructlab/training/utils.py @@ -29,12 +29,15 @@ def retrieve_chat_template(chat_tmpl_path): import importlib.util import sys - spec = importlib.util.spec_from_file_location("spcl_chat_tmpl", chat_tmpl_path) - module = importlib.util.module_from_spec(spec) - sys.modules["spcl_chat_tmpl"] = module - spec.loader.exec_module(module) - SPECIAL_TOKENS = module.SPECIAL_TOKENS - CHAT_TEMPLATE = module.CHAT_TEMPLATE + try: + spec = importlib.util.spec_from_file_location("spcl_chat_tmpl", chat_tmpl_path) + module = importlib.util.module_from_spec(spec) + sys.modules["spcl_chat_tmpl"] = module + spec.loader.exec_module(module) + SPECIAL_TOKENS = module.SPECIAL_TOKENS + CHAT_TEMPLATE = module.CHAT_TEMPLATE + except: + sys.exit(f"Invalid chat template path: {chat_tmpl_path}") return CHAT_TEMPLATE, SPECIAL_TOKENS From 16b9e88f25c4063fb96023f67e27852e76bf93a3 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Fri, 21 Jun 2024 14:11:42 -0400 Subject: [PATCH 8/9] Review feedback applied Signed-off-by: Mustafa Eyceoz --- src/instructlab/training/config.py | 4 ++-- src/instructlab/training/data_process.py | 4 +++- src/instructlab/training/main_ds.py | 4 +++- src/instructlab/training/utils.py | 4 ---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py index b989b18d..7ea000b8 100644 --- a/src/instructlab/training/config.py +++ b/src/instructlab/training/config.py @@ -103,8 +103,8 @@ class TrainingArgs(BaseModel): model_path: str # Specify the chat template / special tokens for training (default is ibm-generic template/tokens) - chat_tmpl_path: str = ( - os.path.dirname(__file__) + "/chat_templates/ibm_generic_tmpl.py" + chat_tmpl_path: str = os.path.join( + os.path.dirname(__file__), "/chat_templates/ibm_generic_tmpl.py" ) # this field specifies the filepath to the training dataset before processing diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py index 9bbe0a5f..9301d185 100644 --- a/src/instructlab/training/data_process.py +++ b/src/instructlab/training/data_process.py @@ -305,7 +305,9 @@ def main(args: DataProcessArgs): parser.add_argument( "--chat-tmpl-path", type=str, - default=f"{os.path.dirname(__file__)}/chat_templates/ibm_generic_tmpl.py", + default=os.path.join( + os.path.dirname(__file__), "chat_templates/ibm_generic_tmpl.py" + ), help="Path to desired chat template and special tokens, defaults to IBM generic.", ) args = parser.parse_args() diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 451413fc..1eb21a68 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -651,7 +651,9 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs): parser.add_argument( "--chat-tmpl-path", type=str, - default=f"{os.path.dirname(__file__)}/chat_templates/ibm_generic_tmpl.py", + default=os.path.join( + os.path.dirname(__file__), "chat_templates/ibm_generic_tmpl.py" + ), ) args = parser.parse_args() set_random_seed(args.seed) diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py index 9e57dc68..5eccb5dc 100644 --- a/src/instructlab/training/utils.py +++ b/src/instructlab/training/utils.py @@ -25,10 +25,6 @@ def retrieve_chat_template(chat_tmpl_path): - # Standard - import importlib.util - import sys - try: spec = importlib.util.spec_from_file_location("spcl_chat_tmpl", chat_tmpl_path) module = importlib.util.module_from_spec(spec) From d0dcef053307720bf85a7f89b6ee762cf7c76353 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Fri, 21 Jun 2024 14:21:04 -0400 Subject: [PATCH 9/9] Remove extraneous slash Signed-off-by: Mustafa Eyceoz --- src/instructlab/training/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py index 7ea000b8..83c7a1f8 100644 --- a/src/instructlab/training/config.py +++ b/src/instructlab/training/config.py @@ -104,7 +104,7 @@ class TrainingArgs(BaseModel): # Specify the chat template / special tokens for training (default is ibm-generic template/tokens) chat_tmpl_path: str = os.path.join( - os.path.dirname(__file__), "/chat_templates/ibm_generic_tmpl.py" + os.path.dirname(__file__), "chat_templates/ibm_generic_tmpl.py" ) # this field specifies the filepath to the training dataset before processing