Skip to content

Commit

Permalink
tiktoken chat formatting (#784)
Browse files (browse the repository at this point in the history)
  • Loading branch information
rajammanabrolu authored Dec 8, 2023
1 parent 454faa8 commit ef60e8e
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 21 deletions.
28 changes: 23 additions & 5 deletions llmfoundry/tokenizers/tiktoken.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,12 +173,30 @@ def default_chat_template(self):
Pinning default Chat ML template in case defaults change.
"""
template = (
"{% set system_message = '' %}"
'{% if USE_DEFAULT_PROMPT == true %}'
"{{'<|im_start|>system\n' + 'DEFAULT_SYSTEM_PROMPT'}}"
"{% if messages[0]['role'] == 'system' %}"
'{% set loop_messages = messages[1:] %}'
"{% set system_message = messages[0]['content'] %}"
"{% elif USE_DEFAULT_PROMPT == true and not 'system' in messages[0]['role'] %}"
'{% set loop_messages = messages %}'
"{% set system_message = 'DEFAULT_SYSTEM_PROMPT' %}"
'{% else %}'
'{% set loop_messages = messages %}'
'{% set system_message = false %}'
'{% endif %}'
'{% for message in loop_messages %}'
'{% if loop.index0 == 0 %}'
'{% if system_message != false %}'
"{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}"
'{% endif %}'
"{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
'{% else %}'
"{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
'{% endif %}'
'{% if (add_generation_prompt == true) %}'
"{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
"{% elif (message['role'] == 'assistant') %}"
'{{ eos_token }}'
'{% endif %}'
'{% for message in messages %}'
"{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
'{% endfor %}')
template = template.replace(
'USE_DEFAULT_PROMPT',
Expand Down
95 changes: 79 additions & 16 deletions tests/tokenizers/test_tiktoken.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,28 +39,88 @@
('gpt2', None),
]

MULTI_TURN_CHAT_ML = [[{
DEFAULT_SYSTEM_PROMPT = """<|im_start|>system\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible."""

# Input conversations for the chat-formatting test. Each conversation is a
# list of message dicts with 'content' and 'role' keys, in the shape passed
# to `apply_chat_template`. The expected-output lists are indexed in parallel
# with this list, so the order of the conversations matters.
MULTI_TURN_CHAT_ML = [
# Conversation 0: no system message; one user turn and one assistant turn.
[{
'content':
'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.',
'role':
'user'
}, {
'content': 'You should go outside and touch grass.',
'role': 'assistant'
}],
# Conversation 1: same user/assistant turns, preceded by an explicit system
# message.
[{
'content':
'You are a honest and helpful AI language model. Tell the user the truth, the whole truth, and nothing but the truth.',
'role':
'system'
}, {
'content':
'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.',
'role':
'user'
}, {
'content': 'You should go outside and touch grass.',
'role': 'assistant'
}]
]

# Expected rendered strings for MULTI_TURN_CHAT_ML, compared index-by-index
# against `apply_chat_template(..., tokenize=False,
# add_generation_prompt=False)`. Presumably this is the case where the
# tokenizer is built without a default system prompt (inferred from the name;
# the tokenizer construction is not visible here).
# NOTE(review): the message content contains '\n\n' after "text:", but no
# blank line appears in these expected strings. It may have been lost when
# this page was captured; confirm against the repository before relying on
# the exact text.
MULTI_TURN_CHAT_STRING_NO_SYSTEM_PROMPT = [
# Conversation 0: no system block is expected in the output.
"""<|im_start|>user
Please summarize the goals in this text:
Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
<|im_start|>assistant
You should go outside and touch grass.<|im_end|><|endoftext|>""",
# Conversation 1: the conversation's own system message is rendered first.
"""<|im_start|>system
You are a honest and helpful AI language model. Tell the user the truth, the whole truth, and nothing but the truth.
<|im_start|>user
Please summarize the goals in this text:
Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
<|im_start|>assistant
You should go outside and touch grass.<|im_end|><|endoftext|>"""
]

# Expected rendered strings for MULTI_TURN_CHAT_ML in the part of the test
# marked "Using default system prompt", compared index-by-index against
# `apply_chat_template(..., tokenize=False, add_generation_prompt=False)`.
# NOTE(review): the message content contains '\n\n' after "text:", but no
# blank line appears in these expected strings. It may have been lost when
# this page was captured; confirm against the repository before relying on
# the exact text.
MULTI_TURN_CHAT_STRING_SYSTEM_PROMPT = [
# Conversation 0 has no system message of its own, so the expected output
# starts with the default system prompt text.
"""<|im_start|>system
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible.
<|im_start|>user
Please summarize the goals in this text:
Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
<|im_start|>assistant
You should go outside and touch grass.<|im_end|><|endoftext|>""",
# Conversation 1 supplies its own system message, which is expected in place
# of the default one.
"""<|im_start|>system
You are a honest and helpful AI language model. Tell the user the truth, the whole truth, and nothing but the truth.
<|im_start|>user
Please summarize the goals in this text:
Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
<|im_start|>assistant
You should go outside and touch grass.<|im_end|><|endoftext|>"""
]

# Input conversations for the generation-prompt part of the chat-formatting
# test: a single conversation with one user turn and one assistant turn, in
# the same message-dict shape ('content' / 'role'). The test renders it with
# `add_generation_prompt=True` and compares the result against
# MULTI_TURN_GENERATE_STRING at the same index.
MULTI_TURN_GENERATE_CHAT_ML = [[{
'content':
'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.',
'role':
'user'
}, {
'content': 'You should go outside and touch grass.',
'role': 'assistant'
}]]

MULTI_TURN_CHAT_STRING = [
"""<|im_start|>user
MULTI_TURN_GENERATE_STRING = [
"""<|im_start|>system
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible.
<|im_start|>user
Please summarize the goals in this text:
Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
<|im_start|>assistant
You should go outside and touch grass.<|im_end|>
"""
]

DEFAULT_SYSTEM_PROMPT = """<|im_start|>system\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible."""


def get_tokenizers_for_testing(
model_name: Optional[str],
Expand Down Expand Up @@ -306,10 +366,9 @@ def test_chat_formatting(model_name: Optional[str],
add_eos_token=False,
additional_special_tokens=special_tokens_to_add)
for i, dict_chats in enumerate(MULTI_TURN_CHAT_ML):
chat_str = wrapped_tokenizer.apply_chat_template(dict_chats,
tokenize=False)
assert chat_str == MULTI_TURN_CHAT_STRING[i]

chat_str = wrapped_tokenizer.apply_chat_template(
dict_chats, tokenize=False, add_generation_prompt=False)
assert chat_str == MULTI_TURN_CHAT_STRING_NO_SYSTEM_PROMPT[i]
# Using default system prompt.
wrapped_tokenizer, _, _ = get_tokenizers_for_testing(
model_name,
Expand All @@ -320,6 +379,10 @@ def test_chat_formatting(model_name: Optional[str],
add_eos_token=False,
additional_special_tokens=special_tokens_to_add)
for i, dict_chats in enumerate(MULTI_TURN_CHAT_ML):
chat_str = wrapped_tokenizer.apply_chat_template(dict_chats,
tokenize=False)
assert chat_str == DEFAULT_SYSTEM_PROMPT + MULTI_TURN_CHAT_STRING[i]
chat_str = wrapped_tokenizer.apply_chat_template(
dict_chats, tokenize=False, add_generation_prompt=False)
assert chat_str == MULTI_TURN_CHAT_STRING_SYSTEM_PROMPT[i]
for i, dict_chats in enumerate(MULTI_TURN_GENERATE_CHAT_ML):
chat_str = wrapped_tokenizer.apply_chat_template(
dict_chats, tokenize=False, add_generation_prompt=True)
assert chat_str == MULTI_TURN_GENERATE_STRING[i]

0 comments on commit ef60e8e

Please sign in to comment.