diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index 6110f565df..fa0e148ff3 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -173,12 +173,30 @@ def default_chat_template(self): Pinning default Chat ML template in case defaults change. """ template = ( - "{% set system_message = '' %}" - '{% if USE_DEFAULT_PROMPT == true %}' - "{{'<|im_start|>system\n' + 'DEFAULT_SYSTEM_PROMPT'}}" + "{% if messages[0]['role'] == 'system' %}" + '{% set loop_messages = messages[1:] %}' + "{% set system_message = messages[0]['content'] %}" + "{% elif USE_DEFAULT_PROMPT == true and not 'system' in messages[0]['role'] %}" + '{% set loop_messages = messages %}' + "{% set system_message = 'DEFAULT_SYSTEM_PROMPT' %}" + '{% else %}' + '{% set loop_messages = messages %}' + '{% set system_message = false %}' + '{% endif %}' + '{% for message in loop_messages %}' + '{% if loop.index0 == 0 %}' + '{% if system_message != false %}' + "{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}" + '{% endif %}' + "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}" + '{% else %}' + "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}" + '{% endif %}' + '{% if (add_generation_prompt == true) %}' + "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}" + "{% elif (message['role'] == 'assistant') %}" + '{{ eos_token }}' '{% endif %}' - '{% for message in messages %}' - "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}" '{% endfor %}') template = template.replace( 'USE_DEFAULT_PROMPT', diff --git a/tests/tokenizers/test_tiktoken.py b/tests/tokenizers/test_tiktoken.py index 60907092c8..1ade2ea156 100644 --- a/tests/tokenizers/test_tiktoken.py +++ b/tests/tokenizers/test_tiktoken.py @@ -39,28 +39,88 @@ ('gpt2', None), ] -MULTI_TURN_CHAT_ML = [[{ +DEFAULT_SYSTEM_PROMPT = """<|im_start|>system\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible.""" + +MULTI_TURN_CHAT_ML = [ + [{ + 'content': + 'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.', + 'role': + 'user' + }, { + 'content': 'You should go outside and touch grass.', + 'role': 'assistant' + }], + [{ + 'content': + 'You are a honest and helpful AI language model. Tell the user the truth, the whole truth, and nothing but the truth.', + 'role': + 'system' + }, { + 'content': + 'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.', + 'role': + 'user' + }, { + 'content': 'You should go outside and touch grass.', + 'role': 'assistant' + }] +] + +MULTI_TURN_CHAT_STRING_NO_SYSTEM_PROMPT = [ + """<|im_start|>user +Please summarize the goals in this text: + +Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|> +<|im_start|>assistant +You should go outside and touch grass.<|im_end|><|endoftext|>""", + """<|im_start|>system +You are a honest and helpful AI language model. Tell the user the truth, the whole truth, and nothing but the truth. +<|im_start|>user +Please summarize the goals in this text: + +Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|> +<|im_start|>assistant +You should go outside and touch grass.<|im_end|><|endoftext|>""" +] + +MULTI_TURN_CHAT_STRING_SYSTEM_PROMPT = [ + """<|im_start|>system +You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. +<|im_start|>user +Please summarize the goals in this text: + +Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|> +<|im_start|>assistant +You should go outside and touch grass.<|im_end|><|endoftext|>""", + """<|im_start|>system +You are a honest and helpful AI language model. Tell the user the truth, the whole truth, and nothing but the truth. +<|im_start|>user +Please summarize the goals in this text: + +Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|> +<|im_start|>assistant +You should go outside and touch grass.<|im_end|><|endoftext|>""" +] + +MULTI_TURN_GENERATE_CHAT_ML = [[{ 'content': 'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.', 'role': 'user' -}, { - 'content': 'You should go outside and touch grass.', - 'role': 'assistant' }]] -MULTI_TURN_CHAT_STRING = [ - """<|im_start|>user +MULTI_TURN_GENERATE_STRING = [ + """<|im_start|>system +You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. +<|im_start|>user Please summarize the goals in this text: Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|> <|im_start|>assistant -You should go outside and touch grass.<|im_end|> """ ] -DEFAULT_SYSTEM_PROMPT = """<|im_start|>system\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible.""" - def get_tokenizers_for_testing( model_name: Optional[str], @@ -306,10 +366,9 @@ def test_chat_formatting(model_name: Optional[str], add_eos_token=False, additional_special_tokens=special_tokens_to_add) for i, dict_chats in enumerate(MULTI_TURN_CHAT_ML): - chat_str = wrapped_tokenizer.apply_chat_template(dict_chats, - tokenize=False) - assert chat_str == MULTI_TURN_CHAT_STRING[i] - + chat_str = wrapped_tokenizer.apply_chat_template( + dict_chats, tokenize=False, add_generation_prompt=False) + assert chat_str == MULTI_TURN_CHAT_STRING_NO_SYSTEM_PROMPT[i] # Using default system prompt. wrapped_tokenizer, _, _ = get_tokenizers_for_testing( model_name, @@ -320,6 +379,10 @@ def test_chat_formatting(model_name: Optional[str], add_eos_token=False, additional_special_tokens=special_tokens_to_add) for i, dict_chats in enumerate(MULTI_TURN_CHAT_ML): - chat_str = wrapped_tokenizer.apply_chat_template(dict_chats, - tokenize=False) - assert chat_str == DEFAULT_SYSTEM_PROMPT + MULTI_TURN_CHAT_STRING[i] + chat_str = wrapped_tokenizer.apply_chat_template( + dict_chats, tokenize=False, add_generation_prompt=False) + assert chat_str == MULTI_TURN_CHAT_STRING_SYSTEM_PROMPT[i] + for i, dict_chats in enumerate(MULTI_TURN_GENERATE_CHAT_ML): + chat_str = wrapped_tokenizer.apply_chat_template( + dict_chats, tokenize=False, add_generation_prompt=True) + assert chat_str == MULTI_TURN_GENERATE_STRING[i]