tiktoken chat formatting (#784)

mosaicml · Dec 8, 2023 · ef60e8e
1 parent 454faa8
commit ef60e8e
Show file tree

Hide file tree

Showing 2 changed files with 102 additions and 21 deletions.
diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py
@@ -173,12 +173,30 @@ def default_chat_template(self):
         Pinning default Chat ML template in case defaults change.
         """
         template = (
-            "{% set system_message = '' %}"
-            '{% if USE_DEFAULT_PROMPT == true %}'
-            "{{'<|im_start|>system\n' + 'DEFAULT_SYSTEM_PROMPT'}}"
+            "{% if messages[0]['role'] == 'system' %}"
+            '{% set loop_messages = messages[1:] %}'
+            "{% set system_message = messages[0]['content'] %}"
+            "{% elif USE_DEFAULT_PROMPT == true and not 'system' in messages[0]['role'] %}"
+            '{% set loop_messages = messages %}'
+            "{% set system_message = 'DEFAULT_SYSTEM_PROMPT' %}"
+            '{% else %}'
+            '{% set loop_messages = messages %}'
+            '{% set system_message = false %}'
+            '{% endif %}'
+            '{% for message in loop_messages %}'
+            '{% if loop.index0 == 0 %}'
+            '{% if system_message != false %}'
+            "{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}"
+            '{% endif %}'
+            "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
+            '{% else %}'
+            "{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}"
+            '{% endif %}'
+            '{% if (add_generation_prompt == true) %}'
+            "{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}"
+            "{% elif (message['role'] == 'assistant') %}"
+            '{{ eos_token }}'
             '{% endif %}'
-            '{% for message in messages %}'
-            "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
             '{% endfor %}')
         template = template.replace(
             'USE_DEFAULT_PROMPT',

diff --git a/tests/tokenizers/test_tiktoken.py b/tests/tokenizers/test_tiktoken.py
@@ -39,28 +39,88 @@
     ('gpt2', None),
 ]
 
-MULTI_TURN_CHAT_ML = [[{
+DEFAULT_SYSTEM_PROMPT = """<|im_start|>system\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible."""
+
+MULTI_TURN_CHAT_ML = [
+    [{
+        'content':
+            'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.',
+        'role':
+            'user'
+    }, {
+        'content': 'You should go outside and touch grass.',
+        'role': 'assistant'
+    }],
+    [{
+        'content':
+            'You are a honest and helpful AI language model. Tell the user the truth, the whole truth, and nothing but the truth.',
+        'role':
+            'system'
+    }, {
+        'content':
+            'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.',
+        'role':
+            'user'
+    }, {
+        'content': 'You should go outside and touch grass.',
+        'role': 'assistant'
+    }]
+]
+
+MULTI_TURN_CHAT_STRING_NO_SYSTEM_PROMPT = [
+    """<|im_start|>user
+Please summarize the goals in this text:
+
+Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
+<|im_start|>assistant
+You should go outside and touch grass.<|im_end|><|endoftext|>""",
+    """<|im_start|>system
+You are a honest and helpful AI language model. Tell the user the truth, the whole truth, and nothing but the truth.
+<|im_start|>user
+Please summarize the goals in this text:
+
+Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
+<|im_start|>assistant
+You should go outside and touch grass.<|im_end|><|endoftext|>"""
+]
+
+MULTI_TURN_CHAT_STRING_SYSTEM_PROMPT = [
+    """<|im_start|>system
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible.
+<|im_start|>user
+Please summarize the goals in this text:
+
+Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
+<|im_start|>assistant
+You should go outside and touch grass.<|im_end|><|endoftext|>""",
+    """<|im_start|>system
+You are a honest and helpful AI language model. Tell the user the truth, the whole truth, and nothing but the truth.
+<|im_start|>user
+Please summarize the goals in this text:
+
+Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
+<|im_start|>assistant
+You should go outside and touch grass.<|im_end|><|endoftext|>"""
+]
+
+MULTI_TURN_GENERATE_CHAT_ML = [[{
     'content':
         'Please summarize the goals in this text:\n\nGoing outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.',
     'role':
         'user'
-}, {
-    'content': 'You should go outside and touch grass.',
-    'role': 'assistant'
 }]]
 
-MULTI_TURN_CHAT_STRING = [
-    """<|im_start|>user
+MULTI_TURN_GENERATE_STRING = [
+    """<|im_start|>system
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible.
+<|im_start|>user
 Please summarize the goals in this text:
 
 Going outside has benefits include reducing stress and triggering the relaxation response, which can help us not only feel better mentally, but even heal faster from physical ailments.<|im_end|>
 <|im_start|>assistant
-You should go outside and touch grass.<|im_end|>
 """
 ]
 
-DEFAULT_SYSTEM_PROMPT = """<|im_start|>system\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible."""
-
 
 def get_tokenizers_for_testing(
     model_name: Optional[str],
@@ -306,10 +366,9 @@ def test_chat_formatting(model_name: Optional[str],
         add_eos_token=False,
         additional_special_tokens=special_tokens_to_add)
     for i, dict_chats in enumerate(MULTI_TURN_CHAT_ML):
-        chat_str = wrapped_tokenizer.apply_chat_template(dict_chats,
-                                                         tokenize=False)
-        assert chat_str == MULTI_TURN_CHAT_STRING[i]
-
+        chat_str = wrapped_tokenizer.apply_chat_template(
+            dict_chats, tokenize=False, add_generation_prompt=False)
+        assert chat_str == MULTI_TURN_CHAT_STRING_NO_SYSTEM_PROMPT[i]
     # Using default system prompt.
     wrapped_tokenizer, _, _ = get_tokenizers_for_testing(
         model_name,
@@ -320,6 +379,10 @@ def test_chat_formatting(model_name: Optional[str],
         add_eos_token=False,
         additional_special_tokens=special_tokens_to_add)
     for i, dict_chats in enumerate(MULTI_TURN_CHAT_ML):
-        chat_str = wrapped_tokenizer.apply_chat_template(dict_chats,
-                                                         tokenize=False)
-        assert chat_str == DEFAULT_SYSTEM_PROMPT + MULTI_TURN_CHAT_STRING[i]
+        chat_str = wrapped_tokenizer.apply_chat_template(
+            dict_chats, tokenize=False, add_generation_prompt=False)
+        assert chat_str == MULTI_TURN_CHAT_STRING_SYSTEM_PROMPT[i]
+    for i, dict_chats in enumerate(MULTI_TURN_GENERATE_CHAT_ML):
+        chat_str = wrapped_tokenizer.apply_chat_template(
+            dict_chats, tokenize=False, add_generation_prompt=True)
+        assert chat_str == MULTI_TURN_GENERATE_STRING[i]