diff --git a/configs/exp/train/classification/classification.json b/configs/exp/train/classification/classification.json index af851ed..5257230 100755 --- a/configs/exp/train/classification/classification.json +++ b/configs/exp/train/classification/classification.json @@ -86,6 +86,11 @@ } }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 1, diff --git a/configs/exp/train/dpo/dpo.json b/configs/exp/train/dpo/dpo.json index 4e4dc21..61bb787 100755 --- a/configs/exp/train/dpo/dpo.json +++ b/configs/exp/train/dpo/dpo.json @@ -96,6 +96,10 @@ } }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 1, diff --git a/configs/exp/train/multimodal/c_abs.json b/configs/exp/train/multimodal/c_abs.json index 713abeb..46f00f7 100644 --- a/configs/exp/train/multimodal/c_abs.json +++ b/configs/exp/train/multimodal/c_abs.json @@ -105,6 +105,10 @@ "tokenizer_settings": { "tokenizer_path": "/from_s3/model" }, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "" + }, "trainer_settings": { "evaluation_strategy": "epoch", "save_strategy": "epoch", diff --git a/configs/exp/train/multimodal/llava.json b/configs/exp/train/multimodal/llava.json deleted file mode 100644 index e69de29..0000000 diff --git a/configs/exp/train/multimodal/mlp.json b/configs/exp/train/multimodal/mlp.json index afb2f2f..68a8925 100644 --- a/configs/exp/train/multimodal/mlp.json +++ b/configs/exp/train/multimodal/mlp.json @@ -105,6 +105,10 @@ "tokenizer_settings": { "tokenizer_path": "/from_s3/model" }, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "" + }, "trainer_settings": { "evaluation_strategy": "epoch", "save_strategy": "epoch", diff --git 
a/configs/exp/train/rag/end2end_rag.json b/configs/exp/train/rag/end2end_rag.json index 28c109a..e0b5cad 100755 --- a/configs/exp/train/rag/end2end_rag.json +++ b/configs/exp/train/rag/end2end_rag.json @@ -112,6 +112,10 @@ "metric_settings": [] }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "" + }, "trainer_settings": { "evaluation_strategy": "steps", "save_strategy": "steps", diff --git a/configs/exp/train/rm/rm.json b/configs/exp/train/rm/rm.json index f46d2f0..b9bd08c 100755 --- a/configs/exp/train/rm/rm.json +++ b/configs/exp/train/rm/rm.json @@ -83,6 +83,10 @@ } }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 1, diff --git a/configs/exp/train/sft/sft.json b/configs/exp/train/sft/sft.json index 14f5606..f746e83 100755 --- a/configs/exp/train/sft/sft.json +++ b/configs/exp/train/sft/sft.json @@ -104,6 +104,10 @@ ] }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>" +}, "trainer_settings": { "evaluation_strategy": "steps", "save_total_limit": 5, diff --git a/tests/fixtures/configs/train/classification/base.json b/tests/fixtures/configs/train/classification/base.json index b586c59..c1f4158 100755 --- a/tests/fixtures/configs/train/classification/base.json +++ b/tests/fixtures/configs/train/classification/base.json @@ -87,6 +87,11 @@ } }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 1, diff --git a/tests/fixtures/configs/train/ddpo/base.json b/tests/fixtures/configs/train/ddpo/base.json index 7da3ff8..57e6b2d 100755 --- a/tests/fixtures/configs/train/ddpo/base.json +++ b/tests/fixtures/configs/train/ddpo/base.json @@ 
-127,6 +127,11 @@ }, "chat_tokenizer_settings": {}, "rm_tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 1, diff --git a/tests/fixtures/configs/train/dpo/base.json b/tests/fixtures/configs/train/dpo/base.json index a06caad..514d462 100755 --- a/tests/fixtures/configs/train/dpo/base.json +++ b/tests/fixtures/configs/train/dpo/base.json @@ -106,6 +106,11 @@ ] }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 2, diff --git a/tests/fixtures/configs/train/dpo/simpo.json b/tests/fixtures/configs/train/dpo/simpo.json index 593463d..01233b8 100755 --- a/tests/fixtures/configs/train/dpo/simpo.json +++ b/tests/fixtures/configs/train/dpo/simpo.json @@ -98,6 +98,11 @@ ] }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 2, diff --git a/tests/fixtures/configs/train/kto/base.json b/tests/fixtures/configs/train/kto/base.json index 28dc7a4..af53a14 100755 --- a/tests/fixtures/configs/train/kto/base.json +++ b/tests/fixtures/configs/train/kto/base.json @@ -84,6 +84,11 @@ "metric_settings": [] }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 4, diff --git a/tests/fixtures/configs/train/multimodal/llama_c_abs_clip_pickle.json b/tests/fixtures/configs/train/multimodal/llama_c_abs_clip_pickle.json index 0195034..49c2b9c 100644 --- a/tests/fixtures/configs/train/multimodal/llama_c_abs_clip_pickle.json +++ b/tests/fixtures/configs/train/multimodal/llama_c_abs_clip_pickle.json @@ 
-103,6 +103,11 @@ } }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "epoch", "save_strategy": "epoch", diff --git a/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json b/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json index 98fbad8..01b5dce 100644 --- a/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json +++ b/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json @@ -103,6 +103,11 @@ } }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "epoch", "save_strategy": "epoch", diff --git a/tests/fixtures/configs/train/multimodal/llama_llava_clip_pickle.json b/tests/fixtures/configs/train/multimodal/llama_llava_clip_pickle.json index d7a46e7..a16a69c 100644 --- a/tests/fixtures/configs/train/multimodal/llama_llava_clip_pickle.json +++ b/tests/fixtures/configs/train/multimodal/llama_llava_clip_pickle.json @@ -103,6 +103,11 @@ } }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "epoch", "save_strategy": "epoch", diff --git a/tests/fixtures/configs/train/rag/base.json b/tests/fixtures/configs/train/rag/base.json index f11b4b8..5fc808e 100755 --- a/tests/fixtures/configs/train/rag/base.json +++ b/tests/fixtures/configs/train/rag/base.json @@ -118,6 +118,11 @@ "metric_settings": [] }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" +}, "trainer_settings": { "evaluation_strategy": "epoch", "save_strategy": "epoch", diff --git a/tests/fixtures/configs/train/rm/base.json b/tests/fixtures/configs/train/rm/base.json index b4bede7..e3fed58 100755 --- a/tests/fixtures/configs/train/rm/base.json +++ 
b/tests/fixtures/configs/train/rm/base.json @@ -89,6 +89,11 @@ } }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 1, diff --git a/tests/fixtures/configs/train/sft/base.json b/tests/fixtures/configs/train/sft/base.json index b5dc9fd..1a48d35 100755 --- a/tests/fixtures/configs/train/sft/base.json +++ b/tests/fixtures/configs/train/sft/base.json @@ -99,6 +99,11 @@ "metric_settings": [] }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 1, diff --git a/tests/fixtures/configs/train/sft/prompt_tuning.json b/tests/fixtures/configs/train/sft/prompt_tuning.json index c187176..ca3eb88 100755 --- a/tests/fixtures/configs/train/sft/prompt_tuning.json +++ b/tests/fixtures/configs/train/sft/prompt_tuning.json @@ -94,6 +94,11 @@ "metric_settings": [] }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 1, diff --git a/tests/fixtures/configs/train/sft/resume_from_checkpoint.json b/tests/fixtures/configs/train/sft/resume_from_checkpoint.json index 95336ba..f7e75a4 100755 --- a/tests/fixtures/configs/train/sft/resume_from_checkpoint.json +++ b/tests/fixtures/configs/train/sft/resume_from_checkpoint.json @@ -78,6 +78,11 @@ "metric_settings": [] }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 1, diff --git a/tests/fixtures/configs/train/sft/sft_with_rm_metric.json b/tests/fixtures/configs/train/sft/sft_with_rm_metric.json index c281971..ee22909 100755 --- 
a/tests/fixtures/configs/train/sft/sft_with_rm_metric.json +++ b/tests/fixtures/configs/train/sft/sft_with_rm_metric.json @@ -121,6 +121,11 @@ ] }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "", + "pad_token": "" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 1, diff --git a/turbo_alignment/common/tf/special_tokens_setter.py b/turbo_alignment/common/tf/special_tokens_setter.py index 3ceb313..b1aaeb6 100755 --- a/turbo_alignment/common/tf/special_tokens_setter.py +++ b/turbo_alignment/common/tf/special_tokens_setter.py @@ -2,16 +2,18 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase from turbo_alignment.common.logging import get_project_logger +from turbo_alignment.settings.tf.special_tokens_setter import SpecialTokensSettings logger = get_project_logger() class SpecialTokensSetter: - def __init__(self, tokenizer: PreTrainedTokenizerBase) -> None: + def __init__(self, tokenizer: PreTrainedTokenizerBase, special_tokens_settings: SpecialTokensSettings) -> None: self._tokenizer = tokenizer + self._special_tokens_settings = special_tokens_settings self._special_tokens_already_set: bool = False - def setBOS(self, bos_token: str = '') -> None: + def setBOS(self, bos_token: str) -> None: if self._tokenizer.bos_token_id is None: logger.info('Model does not have bos_token_id') self._tokenizer.add_special_tokens(special_tokens_dict={'bos_token': bos_token}) @@ -20,7 +22,7 @@ def setBOS(self, bos_token: str = '') -> None: else: logger.info(f'Model has bos_token_id = {self._tokenizer.bos_token_id}') - def setEOS(self, eos_token: str = '') -> None: + def setEOS(self, eos_token: str) -> None: if self._tokenizer.eos_token_id is None: logger.info('Model does not have eos_token_id') self._tokenizer.add_special_tokens(special_tokens_dict={'eos_token': eos_token}) @@ -29,7 +31,11 @@ def setEOS(self, eos_token: str = '') -> None: else: logger.info(f'Model has 
eos_token_id = {self._tokenizer.eos_token_id}') - def setPAD(self, pad_token: str = '') -> None: + def setPAD(self, pad_token: str | None) -> None: + if self._tokenizer.pad_token_id is None and pad_token is None: + logger.info('Skip adding pad_token_id') + return None + if self._tokenizer.pad_token_id is None: logger.info('Model does not have pad_token_id') self._tokenizer.add_special_tokens(special_tokens_dict={'pad_token': pad_token}) @@ -38,7 +44,13 @@ def setPAD(self, pad_token: str = '') -> None: else: logger.info(f'Model has pad_token_id = {self._tokenizer.pad_token_id}') - def setUNK(self, unk_token: str = '') -> None: + return None + + def setUNK(self, unk_token: str | None) -> None: + if self._tokenizer.unk_token_id is None and unk_token is None: + logger.info('Skip adding unk_token_id') + return None + if self._tokenizer.unk_token_id is None: logger.info('Model does not have unk_token_id') self._tokenizer.add_special_tokens(special_tokens_dict={'unk_token': unk_token}) @@ -47,7 +59,13 @@ def setUNK(self, unk_token: str = '') -> None: else: logger.info(f'Model has unk_token_id = {self._tokenizer.unk_token_id}') - def setSEP(self, sep_token: str = '') -> None: + return None + + def setSEP(self, sep_token: str | None) -> None: + if self._tokenizer.sep_token_id is None and sep_token is None: + logger.info('Skip adding sep_token_id') + return None + if self._tokenizer.sep_token_id is None: logger.info('Model does not have sep_token_id') self._tokenizer.add_special_tokens(special_tokens_dict={'sep_token': sep_token}) @@ -56,12 +74,14 @@ def setSEP(self, sep_token: str = '') -> None: else: logger.info(f'Model has sep_token_id = {self._tokenizer.sep_token_id}') - def set_all(self): - self.setBOS() - self.setEOS() - self.setPAD() - self.setUNK() - self.setSEP() + return None + + def set_all(self) -> None: + self.setBOS(bos_token=self._special_tokens_settings.bos_token) + self.setEOS(eos_token=self._special_tokens_settings.eos_token) + 
self.setPAD(pad_token=self._special_tokens_settings.pad_token) + self.setUNK(unk_token=self._special_tokens_settings.unk_token) + self.setSEP(sep_token=self._special_tokens_settings.sep_token) def set_custom_tokens(self, tokens: list[str]) -> None: if self._special_tokens_already_set: @@ -73,8 +93,11 @@ def set_custom_tokens(self, tokens: list[str]) -> None: added_tokens = self._tokenizer.add_special_tokens({'additional_special_tokens': tokens}) assert added_tokens == len(tokens) - def setup_model_config(self, model: PreTrainedModel): - model.config.eos_token_id = self._tokenizer.eos_token_id + def setup_model_config(self, model: PreTrainedModel) -> None: model.config.bos_token_id = self._tokenizer.bos_token_id - model.config.pad_token_id = self._tokenizer.pad_token_id - model.config.sep_token_id = self._tokenizer.sep_token_id + model.config.eos_token_id = self._tokenizer.eos_token_id + + if self._tokenizer.pad_token_id is not None: + model.config.pad_token_id = self._tokenizer.pad_token_id + if self._tokenizer.sep_token_id is not None: + model.config.sep_token_id = self._tokenizer.sep_token_id diff --git a/turbo_alignment/pipelines/train/base.py b/turbo_alignment/pipelines/train/base.py index 20bd8c8..57e4a20 100755 --- a/turbo_alignment/pipelines/train/base.py +++ b/turbo_alignment/pipelines/train/base.py @@ -145,7 +145,7 @@ def run(self, experiment_settings: ExperimentSettingsT) -> None: additional_special_tokens = self._get_additional_special_tokens(experiment_settings) logger.info(f'Special tokens: {additional_special_tokens}') - special_tokens_setter = SpecialTokensSetter(self.tokenizer) + special_tokens_setter = SpecialTokensSetter(self.tokenizer, experiment_settings.special_tokens_settings) special_tokens_setter.set_all() special_tokens_setter.set_custom_tokens(additional_special_tokens) diff --git a/turbo_alignment/pipelines/train/ddpo.py b/turbo_alignment/pipelines/train/ddpo.py index a280b6f..2657fa1 100755 --- a/turbo_alignment/pipelines/train/ddpo.py 
+++ b/turbo_alignment/pipelines/train/ddpo.py @@ -129,14 +129,14 @@ def run(self, experiment_settings: DDPOTrainExperimentSettings) -> None: additional_special_tokens = self._get_additional_special_tokens(experiment_settings) logger.info(f'Special tokens: {additional_special_tokens}') - special_tokens_setter = SpecialTokensSetter(self.rm_tokenizer) + special_tokens_setter = SpecialTokensSetter(self.rm_tokenizer, experiment_settings.special_tokens_settings) special_tokens_setter.set_all() special_tokens_setter.set_custom_tokens(additional_special_tokens) logger.info('RM Special tokens added!') logger.info(f'Special tokens: {additional_special_tokens}') - special_tokens_setter = SpecialTokensSetter(self.tokenizer) + special_tokens_setter = SpecialTokensSetter(self.tokenizer, experiment_settings.special_tokens_settings) special_tokens_setter.set_all() special_tokens_setter.set_custom_tokens(additional_special_tokens) diff --git a/turbo_alignment/settings/pipelines/train/base.py b/turbo_alignment/settings/pipelines/train/base.py index 7ed9d66..7e2173e 100755 --- a/turbo_alignment/settings/pipelines/train/base.py +++ b/turbo_alignment/settings/pipelines/train/base.py @@ -11,6 +11,7 @@ PreTrainedModelSettings, ) from turbo_alignment.settings.s3 import CheckpointUploaderCallbackParameters +from turbo_alignment.settings.tf.special_tokens_setter import SpecialTokensSettings from turbo_alignment.settings.tf.tokenizer import TokenizerSettings from turbo_alignment.settings.tf.trainer import TrainerSettings from turbo_alignment.settings.weights_and_biases import WandbSettings @@ -26,7 +27,9 @@ class BaseTrainExperimentSettings(BaseSettings): seed: int = 42 trainer_settings: TrainerSettings + tokenizer_settings: TokenizerSettings + special_tokens_settings: SpecialTokensSettings model_settings: (ModelForPeftSettings | PreTrainedModelSettings | PreTrainedAdaptersModelSettings) diff --git a/turbo_alignment/settings/tf/special_tokens_setter.py 
b/turbo_alignment/settings/tf/special_tokens_setter.py new file mode 100755 index 0000000..fc6ab7e --- /dev/null +++ b/turbo_alignment/settings/tf/special_tokens_setter.py @@ -0,0 +1,9 @@ +from turbo_alignment.settings.base import ExtraFieldsNotAllowedBaseModel + + +class SpecialTokensSettings(ExtraFieldsNotAllowedBaseModel): + bos_token: str + eos_token: str + pad_token: str | None = None + unk_token: str | None = None + sep_token: str | None = None diff --git a/tutorials/dpo/dpo.json b/tutorials/dpo/dpo.json index 29c2bf0..7948c6e 100755 --- a/tutorials/dpo/dpo.json +++ b/tutorials/dpo/dpo.json @@ -93,6 +93,10 @@ "transformers_settings": {} }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 1, diff --git a/tutorials/kto/kto.json b/tutorials/kto/kto.json index 828eaac..9457b17 100644 --- a/tutorials/kto/kto.json +++ b/tutorials/kto/kto.json @@ -91,6 +91,10 @@ "transformers_settings": {} }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>" + }, "trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 1, diff --git a/tutorials/multimodal/multimodal.json b/tutorials/multimodal/multimodal.json index 6910ab3..12037e7 100644 --- a/tutorials/multimodal/multimodal.json +++ b/tutorials/multimodal/multimodal.json @@ -103,6 +103,10 @@ } }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "", + "eos_token": "" + }, "trainer_settings": { "evaluation_strategy": "steps", "save_strategy": "steps", diff --git a/tutorials/rm/rm.json b/tutorials/rm/rm.json index 758d6db..d552358 100755 --- a/tutorials/rm/rm.json +++ b/tutorials/rm/rm.json @@ -80,6 +80,10 @@ } }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>" + }, 
"trainer_settings": { "evaluation_strategy": "steps", "per_device_train_batch_size": 1, diff --git a/tutorials/sft/sft.json b/tutorials/sft/sft.json index 18de973..674b658 100755 --- a/tutorials/sft/sft.json +++ b/tutorials/sft/sft.json @@ -84,6 +84,10 @@ ] }, "tokenizer_settings": {}, + "special_tokens_settings": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>" + }, "trainer_settings": { "evaluation_strategy": "steps", "save_total_limit": 5,