Commit
🔠 Add SpecialTokensSettings
syrn1k authored Sep 5, 2024
2 parents 633f198 + 5d6f657 commit 184e19c
Showing 32 changed files with 173 additions and 19 deletions.
5 changes: 5 additions & 0 deletions configs/exp/train/classification/classification.json
@@ -86,6 +86,11 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
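Every training config in this commit gains the same special_tokens_settings block (the remaining config and fixture diffs below repeat it with per-model token values). The block is parsed into the new SpecialTokensSettings model; that file is not included in this view, so the following is only a sketch inferred from how SpecialTokensSetter.set_all() consumes it further down — the field names come from the diff, the optionality is an assumption:

# Sketch only: the real definition lives in turbo_alignment/settings/tf/special_tokens_setter.py,
# which is not shown on this page. bos/eos look required (set_all always passes them),
# while pad/unk/sep look optional (setPAD/setUNK/setSEP accept None and several configs omit them).
from pydantic import BaseModel


class SpecialTokensSettings(BaseModel):
    bos_token: str
    eos_token: str
    pad_token: str | None = None
    unk_token: str | None = None
    sep_token: str | None = None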
4 changes: 4 additions & 0 deletions configs/exp/train/dpo/dpo.json
@@ -96,6 +96,10 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<|begin_of_text|>",
"eos_token": "<|end_of_text|>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
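This config uses Llama-3-style tokens and omits pad_token altogether. With the setter changes later in this commit, a missing pad_token is skipped rather than silently defaulted to '<pad>'. A small sketch of that behaviour, assuming the field layout sketched above (the checkpoint name is a placeholder, not taken from this config):

# Sketch: pad_token is optional now. If neither the settings nor the tokenizer define one,
# setPAD() logs 'Skip adding pad_token_id' and leaves pad_token_id unset.
from transformers import AutoTokenizer

from turbo_alignment.common.tf.special_tokens_setter import SpecialTokensSetter
from turbo_alignment.settings.tf.special_tokens_setter import SpecialTokensSettings

tokenizer = AutoTokenizer.from_pretrained('some/llama3-style-model')  # placeholder path
settings = SpecialTokensSettings(bos_token='<|begin_of_text|>', eos_token='<|end_of_text|>')

SpecialTokensSetter(tokenizer, settings).set_all()
print(tokenizer.pad_token_id)  # stays None; collators/trainers must handle padding themselves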
4 changes: 4 additions & 0 deletions configs/exp/train/multimodal/c_abs.json
@@ -105,6 +105,10 @@
"tokenizer_settings": {
"tokenizer_path": "/from_s3/model"
},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
Empty file.
4 changes: 4 additions & 0 deletions configs/exp/train/multimodal/mlp.json
@@ -105,6 +105,10 @@
"tokenizer_settings": {
"tokenizer_path": "/from_s3/model"
},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
4 changes: 4 additions & 0 deletions configs/exp/train/rag/end2end_rag.json
@@ -112,6 +112,10 @@
"metric_settings": []
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"save_strategy": "steps",
4 changes: 4 additions & 0 deletions configs/exp/train/rm/rm.json
@@ -83,6 +83,10 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<|begin_of_text|>",
"eos_token": "<|end_of_text|>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
4 changes: 4 additions & 0 deletions configs/exp/train/sft/sft.json
@@ -104,6 +104,10 @@
]
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<|begin_of_text|>",
"eos_token": "<|end_of_text|>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"save_total_limit": 5,
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/classification/base.json
@@ -87,6 +87,11 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/ddpo/base.json
@@ -127,6 +127,11 @@
},
"chat_tokenizer_settings": {},
"rm_tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/dpo/base.json
@@ -106,6 +106,11 @@
]
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 2,
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/dpo/simpo.json
@@ -98,6 +98,11 @@
]
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 2,
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/kto/base.json
@@ -84,6 +84,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 4,
@@ -103,6 +103,11 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
@@ -103,6 +103,11 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
@@ -103,6 +103,11 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/rag/base.json
@@ -118,6 +118,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/rm/base.json
@@ -89,6 +89,11 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/sft/base.json
@@ -99,6 +99,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/sft/prompt_tuning.json
@@ -94,6 +94,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/sft/resume_from_checkpoint.json
@@ -78,6 +78,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/sft/sft_with_rm_metric.json
@@ -121,6 +121,11 @@
]
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
55 changes: 39 additions & 16 deletions turbo_alignment/common/tf/special_tokens_setter.py
@@ -2,16 +2,18 @@
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
 from turbo_alignment.common.logging import get_project_logger
+from turbo_alignment.settings.tf.special_tokens_setter import SpecialTokensSettings
 
 logger = get_project_logger()
 
 
 class SpecialTokensSetter:
-    def __init__(self, tokenizer: PreTrainedTokenizerBase) -> None:
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, special_tokens_settings: SpecialTokensSettings) -> None:
         self._tokenizer = tokenizer
+        self._special_tokens_settings = special_tokens_settings
         self._special_tokens_already_set: bool = False
 
-    def setBOS(self, bos_token: str = '<s>') -> None:
+    def setBOS(self, bos_token: str) -> None:
         if self._tokenizer.bos_token_id is None:
             logger.info('Model does not have bos_token_id')
             self._tokenizer.add_special_tokens(special_tokens_dict={'bos_token': bos_token})
@@ -20,7 +22,7 @@ def setBOS(self, bos_token: str = '<s>') -> None:
         else:
             logger.info(f'Model has bos_token_id = {self._tokenizer.bos_token_id}')
 
-    def setEOS(self, eos_token: str = '</s>') -> None:
+    def setEOS(self, eos_token: str) -> None:
         if self._tokenizer.eos_token_id is None:
             logger.info('Model does not have eos_token_id')
             self._tokenizer.add_special_tokens(special_tokens_dict={'eos_token': eos_token})
@@ -29,7 +31,11 @@ def setEOS(self, eos_token: str = '</s>') -> None:
         else:
             logger.info(f'Model has eos_token_id = {self._tokenizer.eos_token_id}')
 
-    def setPAD(self, pad_token: str = '<pad>') -> None:
+    def setPAD(self, pad_token: str | None) -> None:
+        if self._tokenizer.pad_token_id is None and pad_token is None:
+            logger.info('Skip adding pad_token_id')
+            return None
+
         if self._tokenizer.pad_token_id is None:
             logger.info('Model does not have pad_token_id')
             self._tokenizer.add_special_tokens(special_tokens_dict={'pad_token': pad_token})
@@ -38,7 +44,13 @@ def setPAD(self, pad_token: str = '<pad>') -> None:
         else:
             logger.info(f'Model has pad_token_id = {self._tokenizer.pad_token_id}')
 
-    def setUNK(self, unk_token: str = '<unk>') -> None:
+        return None
+
+    def setUNK(self, unk_token: str | None) -> None:
+        if self._tokenizer.unk_token_id is None and unk_token is None:
+            logger.info('Skip adding sep_token_id')
+            return None
+
         if self._tokenizer.unk_token_id is None:
             logger.info('Model does not have unk_token_id')
             self._tokenizer.add_special_tokens(special_tokens_dict={'unk_token': unk_token})
@@ -47,7 +59,13 @@ def setUNK(self, unk_token: str = '<unk>') -> None:
         else:
             logger.info(f'Model has unk_token_id = {self._tokenizer.unk_token_id}')
 
-    def setSEP(self, sep_token: str = '<sep>') -> None:
+        return None
+
+    def setSEP(self, sep_token: str | None) -> None:
+        if self._tokenizer.sep_token_id is None and sep_token is None:
+            logger.info('Skip adding sep_token_id')
+            return None
+
         if self._tokenizer.sep_token_id is None:
             logger.info('Model does not have sep_token_id')
             self._tokenizer.add_special_tokens(special_tokens_dict={'sep_token': sep_token})
@@ -56,12 +74,14 @@ def setSEP(self, sep_token: str = '<sep>') -> None:
         else:
             logger.info(f'Model has sep_token_id = {self._tokenizer.sep_token_id}')
 
-    def set_all(self):
-        self.setBOS()
-        self.setEOS()
-        self.setPAD()
-        self.setUNK()
-        self.setSEP()
+        return None
+
+    def set_all(self) -> None:
+        self.setBOS(bos_token=self._special_tokens_settings.bos_token)
+        self.setEOS(eos_token=self._special_tokens_settings.eos_token)
+        self.setPAD(pad_token=self._special_tokens_settings.pad_token)
+        self.setUNK(unk_token=self._special_tokens_settings.unk_token)
+        self.setSEP(sep_token=self._special_tokens_settings.sep_token)
 
     def set_custom_tokens(self, tokens: list[str]) -> None:
         if self._special_tokens_already_set:
@@ -73,8 +93,11 @@ def set_custom_tokens(self, tokens: list[str]) -> None:
         added_tokens = self._tokenizer.add_special_tokens({'additional_special_tokens': tokens})
         assert added_tokens == len(tokens)
 
-    def setup_model_config(self, model: PreTrainedModel):
-        model.config.eos_token_id = self._tokenizer.eos_token_id
+    def setup_model_config(self, model: PreTrainedModel) -> None:
+        model.config.bos_token_id = self._tokenizer.bos_token_id
+        model.config.pad_token_id = self._tokenizer.pad_token_id
+        model.config.sep_token_id = self._tokenizer.sep_token_id
+        model.config.eos_token_id = self._tokenizer.eos_token_id
 
-        if self._tokenizer.pad_token_id is not None:
-            model.config.pad_token_id = self._tokenizer.pad_token_id
+        if self._tokenizer.sep_token_id is not None:
+            model.config.sep_token_id = self._tokenizer.sep_token_id
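Taken together, the class now reads every token from the settings object instead of hard-coded defaults. A minimal usage sketch (model and tokenizer paths are placeholders; the training pipelines wire this up automatically, as the pipeline diffs below show):

# Minimal sketch of the reworked API; the paths and the custom tokens are hypothetical.
from transformers import AutoModelForCausalLM, AutoTokenizer

from turbo_alignment.common.tf.special_tokens_setter import SpecialTokensSetter
from turbo_alignment.settings.tf.special_tokens_setter import SpecialTokensSettings

tokenizer = AutoTokenizer.from_pretrained('some/base-model')     # placeholder path
model = AutoModelForCausalLM.from_pretrained('some/base-model')  # placeholder path

settings = SpecialTokensSettings(bos_token='<s>', eos_token='</s>', pad_token='<pad>')

setter = SpecialTokensSetter(tokenizer, settings)
setter.set_all()                             # adds only the tokens the tokenizer is missing
setter.set_custom_tokens(['<RS>', '<bot>'])  # hypothetical extra special tokens
setter.setup_model_config(model)             # mirrors the resulting ids into model.config

# If the vocabulary grew, the embedding matrix must be resized to match.
model.resize_token_embeddings(len(tokenizer))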
2 changes: 1 addition & 1 deletion turbo_alignment/pipelines/train/base.py
@@ -145,7 +145,7 @@ def run(self, experiment_settings: ExperimentSettingsT) -> None:
 
         additional_special_tokens = self._get_additional_special_tokens(experiment_settings)
         logger.info(f'Special tokens: {additional_special_tokens}')
-        special_tokens_setter = SpecialTokensSetter(self.tokenizer)
+        special_tokens_setter = SpecialTokensSetter(self.tokenizer, experiment_settings.special_tokens_settings)
         special_tokens_setter.set_all()
         special_tokens_setter.set_custom_tokens(additional_special_tokens)
 
4 changes: 2 additions & 2 deletions turbo_alignment/pipelines/train/ddpo.py
@@ -129,14 +129,14 @@ def run(self, experiment_settings: DDPOTrainExperimentSettings) -> None:
         additional_special_tokens = self._get_additional_special_tokens(experiment_settings)
 
         logger.info(f'Special tokens: {additional_special_tokens}')
-        special_tokens_setter = SpecialTokensSetter(self.rm_tokenizer)
+        special_tokens_setter = SpecialTokensSetter(self.rm_tokenizer, experiment_settings.special_tokens_settings)
         special_tokens_setter.set_all()
         special_tokens_setter.set_custom_tokens(additional_special_tokens)
 
         logger.info('RM Special tokens added!')
 
         logger.info(f'Special tokens: {additional_special_tokens}')
-        special_tokens_setter = SpecialTokensSetter(self.tokenizer)
+        special_tokens_setter = SpecialTokensSetter(self.tokenizer, experiment_settings.special_tokens_settings)
         special_tokens_setter.set_all()
         special_tokens_setter.set_custom_tokens(additional_special_tokens)
 
3 changes: 3 additions & 0 deletions turbo_alignment/settings/pipelines/train/base.py
@@ -11,6 +11,7 @@
     PreTrainedModelSettings,
 )
 from turbo_alignment.settings.s3 import CheckpointUploaderCallbackParameters
+from turbo_alignment.settings.tf.special_tokens_setter import SpecialTokensSettings
 from turbo_alignment.settings.tf.tokenizer import TokenizerSettings
 from turbo_alignment.settings.tf.trainer import TrainerSettings
 from turbo_alignment.settings.weights_and_biases import WandbSettings
@@ -26,7 +27,9 @@ class BaseTrainExperimentSettings(BaseSettings):
     seed: int = 42
 
     trainer_settings: TrainerSettings
+
     tokenizer_settings: TokenizerSettings
+    special_tokens_settings: SpecialTokensSettings
 
     model_settings: (ModelForPeftSettings | PreTrainedModelSettings | PreTrainedAdaptersModelSettings)
 
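Because special_tokens_settings is declared without a default, every train experiment config has to carry the block — which is exactly what the config and fixture diffs above add. A quick sketch of a check one could run from the repository root, assuming every JSON under the fixtures train tree is a train experiment config:

# Sketch: verify all train fixture configs now define the required block.
import json
import pathlib

for path in pathlib.Path('tests/fixtures/configs/train').rglob('*.json'):
    config = json.loads(path.read_text(encoding='utf-8'))
    assert 'special_tokens_settings' in config, f'missing block in {path}'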