Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🔠 Add SpecialTokensSettings #11

Merged
merged 3 commits into from
Sep 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions configs/exp/train/classification/classification.json
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
Expand Down
4 changes: 4 additions & 0 deletions configs/exp/train/dpo/dpo.json
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<|begin_of_text|>",
"eos_token": "<|end_of_text|>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
Expand Down
4 changes: 4 additions & 0 deletions configs/exp/train/multimodal/c_abs.json
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@
"tokenizer_settings": {
"tokenizer_path": "/from_s3/model"
},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
Expand Down
Empty file.
4 changes: 4 additions & 0 deletions configs/exp/train/multimodal/mlp.json
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@
"tokenizer_settings": {
"tokenizer_path": "/from_s3/model"
},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
Expand Down
4 changes: 4 additions & 0 deletions configs/exp/train/rag/end2end_rag.json
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@
"metric_settings": []
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"save_strategy": "steps",
Expand Down
4 changes: 4 additions & 0 deletions configs/exp/train/rm/rm.json
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<|begin_of_text|>",
"eos_token": "<|end_of_text|>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
Expand Down
4 changes: 4 additions & 0 deletions configs/exp/train/sft/sft.json
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,10 @@
]
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<|begin_of_text|>",
"eos_token": "<|end_of_text|>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"save_total_limit": 5,
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/classification/base.json
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,11 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/ddpo/base.json
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,11 @@
},
"chat_tokenizer_settings": {},
"rm_tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/dpo/base.json
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@
]
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 2,
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/dpo/simpo.json
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,11 @@
]
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 2,
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/kto/base.json
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 4,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/rag/base.json
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/rm/base.json
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/sft/base.json
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/sft/prompt_tuning.json
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/sft/resume_from_checkpoint.json
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@
"metric_settings": []
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/configs/train/sft/sft_with_rm_metric.json
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,11 @@
]
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
Expand Down
55 changes: 39 additions & 16 deletions turbo_alignment/common/tf/special_tokens_setter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@
from transformers.tokenization_utils_base import PreTrainedTokenizerBase

from turbo_alignment.common.logging import get_project_logger
from turbo_alignment.settings.tf.special_tokens_setter import SpecialTokensSettings

logger = get_project_logger()


class SpecialTokensSetter:
def __init__(self, tokenizer: PreTrainedTokenizerBase) -> None:
def __init__(self, tokenizer: PreTrainedTokenizerBase, special_tokens_settings: SpecialTokensSettings) -> None:
self._tokenizer = tokenizer
self._special_tokens_settings = special_tokens_settings
self._special_tokens_already_set: bool = False

def setBOS(self, bos_token: str = '<s>') -> None:
def setBOS(self, bos_token: str) -> None:
if self._tokenizer.bos_token_id is None:
logger.info('Model does not have bos_token_id')
self._tokenizer.add_special_tokens(special_tokens_dict={'bos_token': bos_token})
Expand All @@ -20,7 +22,7 @@ def setBOS(self, bos_token: str = '<s>') -> None:
else:
logger.info(f'Model has bos_token_id = {self._tokenizer.bos_token_id}')

def setEOS(self, eos_token: str = '</s>') -> None:
def setEOS(self, eos_token: str) -> None:
if self._tokenizer.eos_token_id is None:
logger.info('Model does not have eos_token_id')
self._tokenizer.add_special_tokens(special_tokens_dict={'eos_token': eos_token})
Expand All @@ -29,7 +31,11 @@ def setEOS(self, eos_token: str = '</s>') -> None:
else:
logger.info(f'Model has eos_token_id = {self._tokenizer.eos_token_id}')

def setPAD(self, pad_token: str = '<pad>') -> None:
def setPAD(self, pad_token: str | None) -> None:
if self._tokenizer.pad_token_id is None and pad_token is None:
logger.info('Skip adding pad_token_id')
return None

if self._tokenizer.pad_token_id is None:
logger.info('Model does not have pad_token_id')
self._tokenizer.add_special_tokens(special_tokens_dict={'pad_token': pad_token})
Expand All @@ -38,7 +44,13 @@ def setPAD(self, pad_token: str = '<pad>') -> None:
else:
logger.info(f'Model has pad_token_id = {self._tokenizer.pad_token_id}')

def setUNK(self, unk_token: str = '<unk>') -> None:
return None

def setUNK(self, unk_token: str | None) -> None:
if self._tokenizer.unk_token_id is None and unk_token is None:
            logger.info('Skip adding unk_token_id')
return None

if self._tokenizer.unk_token_id is None:
logger.info('Model does not have unk_token_id')
self._tokenizer.add_special_tokens(special_tokens_dict={'unk_token': unk_token})
Expand All @@ -47,7 +59,13 @@ def setUNK(self, unk_token: str = '<unk>') -> None:
else:
logger.info(f'Model has unk_token_id = {self._tokenizer.unk_token_id}')

def setSEP(self, sep_token: str = '<sep>') -> None:
return None

def setSEP(self, sep_token: str | None) -> None:
if self._tokenizer.sep_token_id is None and sep_token is None:
logger.info('Skip adding sep_token_id')
return None

if self._tokenizer.sep_token_id is None:
logger.info('Model does not have sep_token_id')
self._tokenizer.add_special_tokens(special_tokens_dict={'sep_token': sep_token})
Expand All @@ -56,12 +74,14 @@ def setSEP(self, sep_token: str = '<sep>') -> None:
else:
logger.info(f'Model has sep_token_id = {self._tokenizer.sep_token_id}')

def set_all(self):
self.setBOS()
self.setEOS()
self.setPAD()
self.setUNK()
self.setSEP()
return None

def set_all(self) -> None:
self.setBOS(bos_token=self._special_tokens_settings.bos_token)
self.setEOS(eos_token=self._special_tokens_settings.eos_token)
self.setPAD(pad_token=self._special_tokens_settings.pad_token)
self.setUNK(unk_token=self._special_tokens_settings.unk_token)
self.setSEP(sep_token=self._special_tokens_settings.sep_token)

def set_custom_tokens(self, tokens: list[str]) -> None:
if self._special_tokens_already_set:
Expand All @@ -73,8 +93,11 @@ def set_custom_tokens(self, tokens: list[str]) -> None:
added_tokens = self._tokenizer.add_special_tokens({'additional_special_tokens': tokens})
assert added_tokens == len(tokens)

def setup_model_config(self, model: PreTrainedModel):
model.config.eos_token_id = self._tokenizer.eos_token_id
def setup_model_config(self, model: PreTrainedModel) -> None:
model.config.bos_token_id = self._tokenizer.bos_token_id
model.config.pad_token_id = self._tokenizer.pad_token_id
model.config.sep_token_id = self._tokenizer.sep_token_id
model.config.eos_token_id = self._tokenizer.eos_token_id

if self._tokenizer.pad_token_id is not None:
model.config.pad_token_id = self._tokenizer.pad_token_id
if self._tokenizer.sep_token_id is not None:
model.config.sep_token_id = self._tokenizer.sep_token_id
2 changes: 1 addition & 1 deletion turbo_alignment/pipelines/train/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def run(self, experiment_settings: ExperimentSettingsT) -> None:

additional_special_tokens = self._get_additional_special_tokens(experiment_settings)
logger.info(f'Special tokens: {additional_special_tokens}')
special_tokens_setter = SpecialTokensSetter(self.tokenizer)
special_tokens_setter = SpecialTokensSetter(self.tokenizer, experiment_settings.special_tokens_settings)
special_tokens_setter.set_all()
special_tokens_setter.set_custom_tokens(additional_special_tokens)

Expand Down
4 changes: 2 additions & 2 deletions turbo_alignment/pipelines/train/ddpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,14 +129,14 @@ def run(self, experiment_settings: DDPOTrainExperimentSettings) -> None:
additional_special_tokens = self._get_additional_special_tokens(experiment_settings)

logger.info(f'Special tokens: {additional_special_tokens}')
special_tokens_setter = SpecialTokensSetter(self.rm_tokenizer)
special_tokens_setter = SpecialTokensSetter(self.rm_tokenizer, experiment_settings.special_tokens_settings)
special_tokens_setter.set_all()
special_tokens_setter.set_custom_tokens(additional_special_tokens)

logger.info('RM Special tokens added!')

logger.info(f'Special tokens: {additional_special_tokens}')
special_tokens_setter = SpecialTokensSetter(self.tokenizer)
special_tokens_setter = SpecialTokensSetter(self.tokenizer, experiment_settings.special_tokens_settings)
special_tokens_setter.set_all()
special_tokens_setter.set_custom_tokens(additional_special_tokens)

Expand Down
3 changes: 3 additions & 0 deletions turbo_alignment/settings/pipelines/train/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
PreTrainedModelSettings,
)
from turbo_alignment.settings.s3 import CheckpointUploaderCallbackParameters
from turbo_alignment.settings.tf.special_tokens_setter import SpecialTokensSettings
from turbo_alignment.settings.tf.tokenizer import TokenizerSettings
from turbo_alignment.settings.tf.trainer import TrainerSettings
from turbo_alignment.settings.weights_and_biases import WandbSettings
Expand All @@ -26,7 +27,9 @@ class BaseTrainExperimentSettings(BaseSettings):
seed: int = 42

trainer_settings: TrainerSettings

tokenizer_settings: TokenizerSettings
special_tokens_settings: SpecialTokensSettings

model_settings: (ModelForPeftSettings | PreTrainedModelSettings | PreTrainedAdaptersModelSettings)

Expand Down
Loading
Loading