Skip to content

Commit

Permalink
fix tests and linters
Browse files Browse the repository at this point in the history
  • Loading branch information
Малахов Алексей Павлович committed Sep 3, 2024
1 parent 23981db commit 452d45a
Show file tree
Hide file tree
Showing 10 changed files with 39 additions and 23 deletions.
5 changes: 5 additions & 0 deletions configs/exp/train/classification/classification.json
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
Expand Down
4 changes: 4 additions & 0 deletions configs/exp/train/dpo/dpo.json
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<|begin_of_text|>",
"eos_token": "<|end_of_text|>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
Expand Down
4 changes: 4 additions & 0 deletions configs/exp/train/multimodal/c_abs.json
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@
"tokenizer_settings": {
"tokenizer_path": "/from_s3/model"
},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
Expand Down
Empty file.
4 changes: 4 additions & 0 deletions configs/exp/train/multimodal/mlp.json
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@
"tokenizer_settings": {
"tokenizer_path": "/from_s3/model"
},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>"
},
"trainer_settings": {
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
Expand Down
4 changes: 4 additions & 0 deletions configs/exp/train/rag/end2end_rag.json
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@
"metric_settings": []
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<s>",
"eos_token": "</s>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"save_strategy": "steps",
Expand Down
4 changes: 4 additions & 0 deletions configs/exp/train/rm/rm.json
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@
}
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<|begin_of_text|>",
"eos_token": "<|end_of_text|>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"per_device_train_batch_size": 1,
Expand Down
4 changes: 4 additions & 0 deletions configs/exp/train/sft/sft.json
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,10 @@
]
},
"tokenizer_settings": {},
"special_tokens_settings": {
"bos_token": "<|begin_of_text|>",
"eos_token": "<|end_of_text|>"
},
"trainer_settings": {
"evaluation_strategy": "steps",
"save_total_limit": 5,
Expand Down
29 changes: 8 additions & 21 deletions turbo_alignment/common/tf/special_tokens_setter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,7 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, special_tokens_settings:
self._special_tokens_settings = special_tokens_settings
self._special_tokens_already_set: bool = False

-def setBOS(self, bos_token: str | None) -> None:
-if self._tokenizer.bos_token_id is None and bos_token is None:
-logger.info('Skip adding bos_token_id')
-return None
-
+def setBOS(self, bos_token: str) -> None:
if self._tokenizer.bos_token_id is None:
logger.info('Model does not have bos_token_id')
self._tokenizer.add_special_tokens(special_tokens_dict={'bos_token': bos_token})
Expand All @@ -26,13 +22,7 @@ def setBOS(self, bos_token: str | None) -> None:
else:
logger.info(f'Model has bos_token_id = {self._tokenizer.bos_token_id}')

-return None
-
-def setEOS(self, eos_token: str | None) -> None:
-if self._tokenizer.eos_token_id is None and eos_token is None:
-logger.info('Skip adding eos_token_id')
-return None
-
+def setEOS(self, eos_token: str) -> None:
if self._tokenizer.eos_token_id is None:
logger.info('Model does not have eos_token_id')
self._tokenizer.add_special_tokens(special_tokens_dict={'eos_token': eos_token})
Expand All @@ -41,8 +31,6 @@ def setEOS(self, eos_token: str | None) -> None:
else:
logger.info(f'Model has eos_token_id = {self._tokenizer.eos_token_id}')

-return None
-
def setPAD(self, pad_token: str | None) -> None:
if self._tokenizer.pad_token_id is None and pad_token is None:
logger.info('Skip adding pad_token_id')
Expand Down Expand Up @@ -105,12 +93,11 @@ def set_custom_tokens(self, tokens: list[str]) -> None:
added_tokens = self._tokenizer.add_special_tokens({'additional_special_tokens': tokens})
assert added_tokens == len(tokens)

-def setup_model_config(self, model: PreTrainedModel):
-if self._tokenizer.bos_token_id is None:
-model.config.bos_token_id = self._tokenizer.bos_token_id
-if self._tokenizer.eos_token_id is None:
-model.config.eos_token_id = self._tokenizer.eos_token_id
-if self._tokenizer.pad_token_id is None:
+def setup_model_config(self, model: PreTrainedModel) -> None:
+model.config.bos_token_id = self._tokenizer.bos_token_id
+model.config.eos_token_id = self._tokenizer.eos_token_id
+
+if self._tokenizer.pad_token_id is not None:
model.config.pad_token_id = self._tokenizer.pad_token_id
-if self._tokenizer.sep_token_id is None:
+if self._tokenizer.sep_token_id is not None:
model.config.sep_token_id = self._tokenizer.sep_token_id
4 changes: 2 additions & 2 deletions turbo_alignment/settings/tf/special_tokens_setter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@


class SpecialTokensSettings(ExtraFieldsNotAllowedBaseModel):
-bos_token: str = '<s>'
-eos_token: str = '</s>'
+bos_token: str
+eos_token: str
pad_token: str | None = None
unk_token: str | None = None
sep_token: str | None = None

0 comments on commit 452d45a

Please sign in to comment.