diff --git a/configs/exp/train/classification/classification.json b/configs/exp/train/classification/classification.json
index af851ed..5257230 100755
--- a/configs/exp/train/classification/classification.json
+++ b/configs/exp/train/classification/classification.json
@@ -86,6 +86,11 @@
}
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
diff --git a/configs/exp/train/dpo/dpo.json b/configs/exp/train/dpo/dpo.json
index 4e4dc21..61bb787 100755
--- a/configs/exp/train/dpo/dpo.json
+++ b/configs/exp/train/dpo/dpo.json
@@ -96,6 +96,10 @@
}
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "<|begin_of_text|>",
+ "eos_token": "<|end_of_text|>"
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
diff --git a/configs/exp/train/multimodal/c_abs.json b/configs/exp/train/multimodal/c_abs.json
index 713abeb..46f00f7 100644
--- a/configs/exp/train/multimodal/c_abs.json
+++ b/configs/exp/train/multimodal/c_abs.json
@@ -105,6 +105,10 @@
"tokenizer_settings": {
"tokenizer_path": "/from_s3/model"
},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
diff --git a/configs/exp/train/multimodal/llava.json b/configs/exp/train/multimodal/llava.json
deleted file mode 100644
index e69de29..0000000
diff --git a/configs/exp/train/multimodal/mlp.json b/configs/exp/train/multimodal/mlp.json
index afb2f2f..68a8925 100644
--- a/configs/exp/train/multimodal/mlp.json
+++ b/configs/exp/train/multimodal/mlp.json
@@ -105,6 +105,10 @@
"tokenizer_settings": {
"tokenizer_path": "/from_s3/model"
},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
diff --git a/configs/exp/train/rag/end2end_rag.json b/configs/exp/train/rag/end2end_rag.json
index 28c109a..e0b5cad 100755
--- a/configs/exp/train/rag/end2end_rag.json
+++ b/configs/exp/train/rag/end2end_rag.json
@@ -112,6 +112,10 @@
"metric_settings": []
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"save_strategy": "steps",
diff --git a/configs/exp/train/rm/rm.json b/configs/exp/train/rm/rm.json
index f46d2f0..b9bd08c 100755
--- a/configs/exp/train/rm/rm.json
+++ b/configs/exp/train/rm/rm.json
@@ -83,6 +83,10 @@
}
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "<|begin_of_text|>",
+ "eos_token": "<|end_of_text|>"
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
diff --git a/configs/exp/train/sft/sft.json b/configs/exp/train/sft/sft.json
index 14f5606..f746e83 100755
--- a/configs/exp/train/sft/sft.json
+++ b/configs/exp/train/sft/sft.json
@@ -104,6 +104,10 @@
]
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "<|begin_of_text|>",
+ "eos_token": "<|end_of_text|>"
+  },
"trainer_settings": {
"evaluation_strategy": "steps",
"save_total_limit": 5,
diff --git a/tests/fixtures/configs/train/classification/base.json b/tests/fixtures/configs/train/classification/base.json
index b586c59..c1f4158 100755
--- a/tests/fixtures/configs/train/classification/base.json
+++ b/tests/fixtures/configs/train/classification/base.json
@@ -87,6 +87,11 @@
}
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
diff --git a/tests/fixtures/configs/train/ddpo/base.json b/tests/fixtures/configs/train/ddpo/base.json
index 7da3ff8..57e6b2d 100755
--- a/tests/fixtures/configs/train/ddpo/base.json
+++ b/tests/fixtures/configs/train/ddpo/base.json
@@ -127,6 +127,11 @@
},
"chat_tokenizer_settings": {},
"rm_tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
diff --git a/tests/fixtures/configs/train/dpo/base.json b/tests/fixtures/configs/train/dpo/base.json
index a06caad..514d462 100755
--- a/tests/fixtures/configs/train/dpo/base.json
+++ b/tests/fixtures/configs/train/dpo/base.json
@@ -106,6 +106,11 @@
]
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 2,
diff --git a/tests/fixtures/configs/train/dpo/simpo.json b/tests/fixtures/configs/train/dpo/simpo.json
index 593463d..01233b8 100755
--- a/tests/fixtures/configs/train/dpo/simpo.json
+++ b/tests/fixtures/configs/train/dpo/simpo.json
@@ -98,6 +98,11 @@
]
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 2,
diff --git a/tests/fixtures/configs/train/kto/base.json b/tests/fixtures/configs/train/kto/base.json
index 28dc7a4..af53a14 100755
--- a/tests/fixtures/configs/train/kto/base.json
+++ b/tests/fixtures/configs/train/kto/base.json
@@ -84,6 +84,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 4,
diff --git a/tests/fixtures/configs/train/multimodal/llama_c_abs_clip_pickle.json b/tests/fixtures/configs/train/multimodal/llama_c_abs_clip_pickle.json
index 0195034..49c2b9c 100644
--- a/tests/fixtures/configs/train/multimodal/llama_c_abs_clip_pickle.json
+++ b/tests/fixtures/configs/train/multimodal/llama_c_abs_clip_pickle.json
@@ -103,6 +103,11 @@
}
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
diff --git a/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json b/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json
index 98fbad8..01b5dce 100644
--- a/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json
+++ b/tests/fixtures/configs/train/multimodal/llama_llava_base_clip.json
@@ -103,6 +103,11 @@
}
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
diff --git a/tests/fixtures/configs/train/multimodal/llama_llava_clip_pickle.json b/tests/fixtures/configs/train/multimodal/llama_llava_clip_pickle.json
index d7a46e7..a16a69c 100644
--- a/tests/fixtures/configs/train/multimodal/llama_llava_clip_pickle.json
+++ b/tests/fixtures/configs/train/multimodal/llama_llava_clip_pickle.json
@@ -103,6 +103,11 @@
}
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
diff --git a/tests/fixtures/configs/train/rag/base.json b/tests/fixtures/configs/train/rag/base.json
index f11b4b8..5fc808e 100755
--- a/tests/fixtures/configs/train/rag/base.json
+++ b/tests/fixtures/configs/train/rag/base.json
@@ -118,6 +118,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+  },
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
diff --git a/tests/fixtures/configs/train/rm/base.json b/tests/fixtures/configs/train/rm/base.json
index b4bede7..e3fed58 100755
--- a/tests/fixtures/configs/train/rm/base.json
+++ b/tests/fixtures/configs/train/rm/base.json
@@ -89,6 +89,11 @@
}
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
diff --git a/tests/fixtures/configs/train/sft/base.json b/tests/fixtures/configs/train/sft/base.json
index b5dc9fd..1a48d35 100755
--- a/tests/fixtures/configs/train/sft/base.json
+++ b/tests/fixtures/configs/train/sft/base.json
@@ -99,6 +99,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
diff --git a/tests/fixtures/configs/train/sft/prompt_tuning.json b/tests/fixtures/configs/train/sft/prompt_tuning.json
index c187176..ca3eb88 100755
--- a/tests/fixtures/configs/train/sft/prompt_tuning.json
+++ b/tests/fixtures/configs/train/sft/prompt_tuning.json
@@ -94,6 +94,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
diff --git a/tests/fixtures/configs/train/sft/resume_from_checkpoint.json b/tests/fixtures/configs/train/sft/resume_from_checkpoint.json
index 95336ba..f7e75a4 100755
--- a/tests/fixtures/configs/train/sft/resume_from_checkpoint.json
+++ b/tests/fixtures/configs/train/sft/resume_from_checkpoint.json
@@ -78,6 +78,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
diff --git a/tests/fixtures/configs/train/sft/sft_with_rm_metric.json b/tests/fixtures/configs/train/sft/sft_with_rm_metric.json
index c281971..ee22909 100755
--- a/tests/fixtures/configs/train/sft/sft_with_rm_metric.json
+++ b/tests/fixtures/configs/train/sft/sft_with_rm_metric.json
@@ -121,6 +121,11 @@
]
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": "",
+ "pad_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
diff --git a/turbo_alignment/common/tf/special_tokens_setter.py b/turbo_alignment/common/tf/special_tokens_setter.py
index 3ceb313..b1aaeb6 100755
--- a/turbo_alignment/common/tf/special_tokens_setter.py
+++ b/turbo_alignment/common/tf/special_tokens_setter.py
@@ -2,16 +2,18 @@
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from turbo_alignment.common.logging import get_project_logger
+from turbo_alignment.settings.tf.special_tokens_setter import SpecialTokensSettings
logger = get_project_logger()
class SpecialTokensSetter:
- def __init__(self, tokenizer: PreTrainedTokenizerBase) -> None:
+ def __init__(self, tokenizer: PreTrainedTokenizerBase, special_tokens_settings: SpecialTokensSettings) -> None:
self._tokenizer = tokenizer
+ self._special_tokens_settings = special_tokens_settings
self._special_tokens_already_set: bool = False
- def setBOS(self, bos_token: str = '') -> None:
+ def setBOS(self, bos_token: str) -> None:
if self._tokenizer.bos_token_id is None:
logger.info('Model does not have bos_token_id')
self._tokenizer.add_special_tokens(special_tokens_dict={'bos_token': bos_token})
@@ -20,7 +22,7 @@ def setBOS(self, bos_token: str = '') -> None:
else:
logger.info(f'Model has bos_token_id = {self._tokenizer.bos_token_id}')
- def setEOS(self, eos_token: str = '') -> None:
+ def setEOS(self, eos_token: str) -> None:
if self._tokenizer.eos_token_id is None:
logger.info('Model does not have eos_token_id')
self._tokenizer.add_special_tokens(special_tokens_dict={'eos_token': eos_token})
@@ -29,7 +31,11 @@ def setEOS(self, eos_token: str = '') -> None:
else:
logger.info(f'Model has eos_token_id = {self._tokenizer.eos_token_id}')
- def setPAD(self, pad_token: str = '') -> None:
+ def setPAD(self, pad_token: str | None) -> None:
+ if self._tokenizer.pad_token_id is None and pad_token is None:
+ logger.info('Skip adding pad_token_id')
+ return None
+
if self._tokenizer.pad_token_id is None:
logger.info('Model does not have pad_token_id')
self._tokenizer.add_special_tokens(special_tokens_dict={'pad_token': pad_token})
@@ -38,7 +44,13 @@ def setPAD(self, pad_token: str = '') -> None:
else:
logger.info(f'Model has pad_token_id = {self._tokenizer.pad_token_id}')
- def setUNK(self, unk_token: str = '') -> None:
+ return None
+
+ def setUNK(self, unk_token: str | None) -> None:
+ if self._tokenizer.unk_token_id is None and unk_token is None:
+            logger.info('Skip adding unk_token_id')
+ return None
+
if self._tokenizer.unk_token_id is None:
logger.info('Model does not have unk_token_id')
self._tokenizer.add_special_tokens(special_tokens_dict={'unk_token': unk_token})
@@ -47,7 +59,13 @@ def setUNK(self, unk_token: str = '') -> None:
else:
logger.info(f'Model has unk_token_id = {self._tokenizer.unk_token_id}')
- def setSEP(self, sep_token: str = '') -> None:
+ return None
+
+ def setSEP(self, sep_token: str | None) -> None:
+ if self._tokenizer.sep_token_id is None and sep_token is None:
+ logger.info('Skip adding sep_token_id')
+ return None
+
if self._tokenizer.sep_token_id is None:
logger.info('Model does not have sep_token_id')
self._tokenizer.add_special_tokens(special_tokens_dict={'sep_token': sep_token})
@@ -56,12 +74,14 @@ def setSEP(self, sep_token: str = '') -> None:
else:
logger.info(f'Model has sep_token_id = {self._tokenizer.sep_token_id}')
- def set_all(self):
- self.setBOS()
- self.setEOS()
- self.setPAD()
- self.setUNK()
- self.setSEP()
+ return None
+
+ def set_all(self) -> None:
+ self.setBOS(bos_token=self._special_tokens_settings.bos_token)
+ self.setEOS(eos_token=self._special_tokens_settings.eos_token)
+ self.setPAD(pad_token=self._special_tokens_settings.pad_token)
+ self.setUNK(unk_token=self._special_tokens_settings.unk_token)
+ self.setSEP(sep_token=self._special_tokens_settings.sep_token)
def set_custom_tokens(self, tokens: list[str]) -> None:
if self._special_tokens_already_set:
@@ -73,8 +93,11 @@ def set_custom_tokens(self, tokens: list[str]) -> None:
added_tokens = self._tokenizer.add_special_tokens({'additional_special_tokens': tokens})
assert added_tokens == len(tokens)
- def setup_model_config(self, model: PreTrainedModel):
- model.config.eos_token_id = self._tokenizer.eos_token_id
+ def setup_model_config(self, model: PreTrainedModel) -> None:
model.config.bos_token_id = self._tokenizer.bos_token_id
- model.config.pad_token_id = self._tokenizer.pad_token_id
- model.config.sep_token_id = self._tokenizer.sep_token_id
+ model.config.eos_token_id = self._tokenizer.eos_token_id
+
+ if self._tokenizer.pad_token_id is not None:
+ model.config.pad_token_id = self._tokenizer.pad_token_id
+ if self._tokenizer.sep_token_id is not None:
+ model.config.sep_token_id = self._tokenizer.sep_token_id
diff --git a/turbo_alignment/pipelines/train/base.py b/turbo_alignment/pipelines/train/base.py
index 20bd8c8..57e4a20 100755
--- a/turbo_alignment/pipelines/train/base.py
+++ b/turbo_alignment/pipelines/train/base.py
@@ -145,7 +145,7 @@ def run(self, experiment_settings: ExperimentSettingsT) -> None:
additional_special_tokens = self._get_additional_special_tokens(experiment_settings)
logger.info(f'Special tokens: {additional_special_tokens}')
- special_tokens_setter = SpecialTokensSetter(self.tokenizer)
+ special_tokens_setter = SpecialTokensSetter(self.tokenizer, experiment_settings.special_tokens_settings)
special_tokens_setter.set_all()
special_tokens_setter.set_custom_tokens(additional_special_tokens)
diff --git a/turbo_alignment/pipelines/train/ddpo.py b/turbo_alignment/pipelines/train/ddpo.py
index a280b6f..2657fa1 100755
--- a/turbo_alignment/pipelines/train/ddpo.py
+++ b/turbo_alignment/pipelines/train/ddpo.py
@@ -129,14 +129,14 @@ def run(self, experiment_settings: DDPOTrainExperimentSettings) -> None:
additional_special_tokens = self._get_additional_special_tokens(experiment_settings)
logger.info(f'Special tokens: {additional_special_tokens}')
- special_tokens_setter = SpecialTokensSetter(self.rm_tokenizer)
+ special_tokens_setter = SpecialTokensSetter(self.rm_tokenizer, experiment_settings.special_tokens_settings)
special_tokens_setter.set_all()
special_tokens_setter.set_custom_tokens(additional_special_tokens)
logger.info('RM Special tokens added!')
logger.info(f'Special tokens: {additional_special_tokens}')
- special_tokens_setter = SpecialTokensSetter(self.tokenizer)
+ special_tokens_setter = SpecialTokensSetter(self.tokenizer, experiment_settings.special_tokens_settings)
special_tokens_setter.set_all()
special_tokens_setter.set_custom_tokens(additional_special_tokens)
diff --git a/turbo_alignment/settings/pipelines/train/base.py b/turbo_alignment/settings/pipelines/train/base.py
index 7ed9d66..7e2173e 100755
--- a/turbo_alignment/settings/pipelines/train/base.py
+++ b/turbo_alignment/settings/pipelines/train/base.py
@@ -11,6 +11,7 @@
PreTrainedModelSettings,
)
from turbo_alignment.settings.s3 import CheckpointUploaderCallbackParameters
+from turbo_alignment.settings.tf.special_tokens_setter import SpecialTokensSettings
from turbo_alignment.settings.tf.tokenizer import TokenizerSettings
from turbo_alignment.settings.tf.trainer import TrainerSettings
from turbo_alignment.settings.weights_and_biases import WandbSettings
@@ -26,7 +27,9 @@ class BaseTrainExperimentSettings(BaseSettings):
seed: int = 42
trainer_settings: TrainerSettings
+
tokenizer_settings: TokenizerSettings
+ special_tokens_settings: SpecialTokensSettings
model_settings: (ModelForPeftSettings | PreTrainedModelSettings | PreTrainedAdaptersModelSettings)
diff --git a/turbo_alignment/settings/tf/special_tokens_setter.py b/turbo_alignment/settings/tf/special_tokens_setter.py
new file mode 100755
index 0000000..fc6ab7e
--- /dev/null
+++ b/turbo_alignment/settings/tf/special_tokens_setter.py
@@ -0,0 +1,9 @@
+from turbo_alignment.settings.base import ExtraFieldsNotAllowedBaseModel
+
+
+class SpecialTokensSettings(ExtraFieldsNotAllowedBaseModel):
+ bos_token: str
+ eos_token: str
+ pad_token: str | None = None
+ unk_token: str | None = None
+ sep_token: str | None = None
diff --git a/tutorials/dpo/dpo.json b/tutorials/dpo/dpo.json
index 29c2bf0..7948c6e 100755
--- a/tutorials/dpo/dpo.json
+++ b/tutorials/dpo/dpo.json
@@ -93,6 +93,10 @@
"transformers_settings": {}
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "<|begin_of_text|>",
+ "eos_token": "<|end_of_text|>"
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
diff --git a/tutorials/kto/kto.json b/tutorials/kto/kto.json
index 828eaac..9457b17 100644
--- a/tutorials/kto/kto.json
+++ b/tutorials/kto/kto.json
@@ -91,6 +91,10 @@
"transformers_settings": {}
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "<|begin_of_text|>",
+ "eos_token": "<|end_of_text|>"
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
diff --git a/tutorials/multimodal/multimodal.json b/tutorials/multimodal/multimodal.json
index 6910ab3..12037e7 100644
--- a/tutorials/multimodal/multimodal.json
+++ b/tutorials/multimodal/multimodal.json
@@ -103,6 +103,10 @@
}
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "",
+ "eos_token": ""
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"save_strategy": "steps",
diff --git a/tutorials/rm/rm.json b/tutorials/rm/rm.json
index 758d6db..d552358 100755
--- a/tutorials/rm/rm.json
+++ b/tutorials/rm/rm.json
@@ -80,6 +80,10 @@
}
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "<|begin_of_text|>",
+ "eos_token": "<|end_of_text|>"
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
diff --git a/tutorials/sft/sft.json b/tutorials/sft/sft.json
index 18de973..674b658 100755
--- a/tutorials/sft/sft.json
+++ b/tutorials/sft/sft.json
@@ -84,6 +84,10 @@
]
},
"tokenizer_settings": {},
+ "special_tokens_settings": {
+ "bos_token": "<|begin_of_text|>",
+ "eos_token": "<|end_of_text|>"
+ },
"trainer_settings": {
"evaluation_strategy": "steps",
"save_total_limit": 5,