Skip to content

Commit

Permalink
add ignore chat dataset flag
Browse files Browse the repository at this point in the history
  • Loading branch information
Малахов Алексей Павлович committed Sep 23, 2024
1 parent 4b0ecc0 commit b25a34d
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 3 deletions.
9 changes: 8 additions & 1 deletion turbo_alignment/dataset/chat/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,14 @@ def _encode(
inference: bool,
random_cut: bool,
) -> list[dict[str, Any] | None]:
conversations = [Conversation(system_prompt=self.source.system_prompt, messages=r.messages) for r in records]
conversations = [
Conversation(
system_prompt=self.source.system_prompt,
messages=r.messages,
ignore_system_prompt=self.settings.ignore_system_prompt,
)
for r in records
]

logger.info(f'Tokenizing dataset {self.source.name}')
tokenized_replicas = self.__tokenize([m.content for c in conversations for m in c.messages])
Expand Down
9 changes: 7 additions & 2 deletions turbo_alignment/dataset/chat/conversation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,20 @@ def __init__(
self,
system_prompt: str | None,
messages: list[ChatMessage],
ignore_system_prompt: bool,
):
self._messages: list[ChatMessage] = []

if system_prompt:
self._messages += [ChatMessage(role=ChatMessageRole.SYSTEM, content=system_prompt, disable_loss=True)]
if system_prompt and ignore_system_prompt:
            raise ValueError('You cannot ignore system_prompt and use a system prompt in the config simultaneously')

if system_prompt or ignore_system_prompt:
if messages[0].role == ChatMessageRole.SYSTEM:
messages = messages[1:]

if system_prompt:
self._messages += [ChatMessage(role=ChatMessageRole.SYSTEM, content=system_prompt, disable_loss=True)]

self._messages += messages

@property
Expand Down
1 change: 1 addition & 0 deletions turbo_alignment/settings/datasets/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class ChatDatasetSettings(BaseDatasetSettings):
keep_end: bool | None = None
max_tokens_count: int | None
prompt_template: ChatPromptTemplate
ignore_system_prompt: bool = False


class ChatMultiDatasetSettings(ChatDatasetSettings, MultiDatasetSettings):
Expand Down

0 comments on commit b25a34d

Please sign in to comment.