From a64ffa593c04328f57378dff0652f33f11717650 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Wed, 19 Jun 2024 14:52:24 -0400
Subject: [PATCH 1/9] First pass chat template (backend only)

Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com>
---
 .../chat_templates/ibm_generic_tmpl.py        | 23 +++++++++++
 .../training/chat_templates/mistral_tmpl.py   | 23 +++++++++++
 src/instructlab/training/data_process.py      | 14 +------
 src/instructlab/training/tokenizer_utils.py   | 40 ++++++++-----------
 4 files changed, 64 insertions(+), 36 deletions(-)
 create mode 100644 src/instructlab/training/chat_templates/ibm_generic_tmpl.py
 create mode 100644 src/instructlab/training/chat_templates/mistral_tmpl.py
diff --git a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py
new file mode 100644
index 00000000..ba54e68d
--- /dev/null
+++ b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py
@@ -0,0 +1,23 @@
+from tokenizer_utils import SpecialTokens
+
+SPECIAL_TOKENS = SpecialTokens(
+    system="<|system|>",
+    user="<|user|>",
+    assistant="<|assistant|>",
+    eos="<|endoftext|>",
+    pad="<|pad|>"
+)
+
+CHAT_TEMPLATE = (
+    "{% for message in messages %}"
+    "{% if message['role'] == 'pretraining' %}"
+    "{{'<|endoftext|>' + message['content'] + '<|endoftext|>'}}"
+    "{% elif message['role'] == 'system' %}"
+    "{{'<|system|>'+ '\n' + message['content'] + '\n'}}"
+    "{% elif message['role'] == 'user' %}"
+    "{{'<|user|>' + '\n' + message['content'] + '\n'}}"
+    "{% elif message['role'] == 'assistant' %}"
+    "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}"
+    "{% endif %}"
+    "{% endfor %}"
+)
\ No newline at end of file
diff --git a/src/instructlab/training/chat_templates/mistral_tmpl.py b/src/instructlab/training/chat_templates/mistral_tmpl.py
new file mode 100644
index 00000000..ba85e93d
--- /dev/null
+++ b/src/instructlab/training/chat_templates/mistral_tmpl.py
@@ -0,0 +1,23 @@
+from tokenizer_utils import SpecialTokens
+
+SPECIAL_TOKENS = SpecialTokens(
+    bos="<s>",
+    eos="</s>",
+    user="[INST]",
+    assistant="[/INST]",
+    
+
+)
+
+CHAT_TEMPLATE = (
+    "{{ '<s>' }}"
+    "{% for message in messages %}"
+    "{% if message['role'] == 'pretraining' %}"
+    "{{ message['content'] + '</s>' }}"
+    "{% elif message['role'] == 'user' %}"
+    "{{ '[INST] ' + message['content'] + ' [/INST]' }}"
+    "{% elif message['role'] == 'assistant' %}"
+    "{{ message['content'] + '</s>'}}"
+    "{% endif %}"
+    "{% endfor %}"
+)
diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index f8f43535..9ea51a38 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -37,18 +37,6 @@ def check_valid_sample(
     if not any(token in whole_sentence_tk for token in special_tokens):
         return True
 
-    # first token should be system_token
-    if whole_sentence_tk[0] != system_tk:
-        print("\033[91mfirst token is not a system_token\033[0m")
-        log_rank_0(tokenizer.decode(whole_sentence_tk), to_print=True)
-        return False
-
-    # check there's only one system_token
-    if (np.array(whole_sentence_tk) == system_tk).sum() != 1:
-        print("\033[91mthere are more than one system_token\033[0m")
-        log_rank_0(tokenizer.decode(whole_sentence_tk), to_print=True)
-        return False
-
     whole_sentence_tk = np.array(whole_sentence_tk)
     user_token_index = (whole_sentence_tk == user_tk).nonzero()[0]
     assistant_token_index = (whole_sentence_tk == assistant_tk).nonzero()[0]
@@ -121,7 +109,7 @@ def unmask_only_assistant_responses(
     whole_sentence = chosen_token["input_ids"][:sentence_legth].clone()
 
     # pre-training mode
-    if system_tk not in whole_sentence:
+    if not (system_tk in whole_sentence or user_token in whole_sentence or assist_token in whole_sentence):
         return labels
 
     labels[:sentence_legth] = -100
diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py
index 7eff0e69..c7e195ac 100644
--- a/src/instructlab/training/tokenizer_utils.py
+++ b/src/instructlab/training/tokenizer_utils.py
@@ -10,44 +10,38 @@
 
 @dataclass
 class SpecialTokens:
-    system: str = field(default="<|system|>")
+    system: str = field(default=None)
     user: str = field(default="<|user|>")
     assistant: str = field(default="<|assistant|>")
     eos: str = field(default="<|endoftext|>")
-    pad: str = field(default="<|pad|>")
+    pad: str = field(default=None)
+    bos: str = field(default="<|begginingoftext|>")
 
 
-SPECIAL_TOKENS = SpecialTokens()
-
-CHAT_TEMPLATE = (
-    "{% for message in messages %}"
-    "{% if message['role'] == 'pretraining' %}"
-    "{{'<|endoftext|>' + message['content'] + '<|endoftext|>'}}"
-    "{% elif message['role'] == 'system' %}"
-    "{{'<|system|>'+ '\n' + message['content'] + '\n'}}"
-    "{% elif message['role'] == 'user' %}"
-    "{{'<|user|>' + '\n' + message['content'] + '\n'}}"
-    "{% elif message['role'] == 'assistant' %}"
-    "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}"
-    "{% endif %}"
-    "{% endfor %}"
-)
+#TODO: Replace with specified template path
+from instructlab.training.chat_templates.ibm_generic_tmpl import SPECIAL_TOKENS, CHAT_TEMPLATE
 
 
 def setup_tokenizer(
     model_name_or_path, SPECIAL_TOKENS=SPECIAL_TOKENS, CHAT_TEMPLATE=CHAT_TEMPLATE
 ):
     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, fast_tokenizer=True)
+
+    if not SPECIAL_TOKENS.pad:
+        SPECIAL_TOKENS.pad = SPECIAL_TOKENS.eos
     tokenizer.add_special_tokens(
-        {"eos_token": SPECIAL_TOKENS.eos, "pad_token": SPECIAL_TOKENS.pad}
+        {"bos_token": SPECIAL_TOKENS.bos, "eos_token": SPECIAL_TOKENS.eos, "pad_token": SPECIAL_TOKENS.pad}
     )
+
+    if SPECIAL_TOKENS.system:
+        add_token_list = [SPECIAL_TOKENS.system]
+    else:
+        add_token_list = []
+    add_token_list.extend([SPECIAL_TOKENS.user, SPECIAL_TOKENS.assistant])
+
     tokenizer.add_special_tokens(
         {
-            "additional_special_tokens": [
-                SPECIAL_TOKENS.system,
-                SPECIAL_TOKENS.user,
-                SPECIAL_TOKENS.assistant,
-            ]
+            "additional_special_tokens": add_token_list
         }
     )
     if getattr(tokenizer, "add_bos_token", False) or getattr(

From fa973a0f84e46256f5f96b5d833c8c6535311506 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Wed, 19 Jun 2024 15:32:13 -0400
Subject: [PATCH 2/9] Module path importing

Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com>
---
 src/instructlab/training/tokenizer_utils.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py
index c7e195ac..de6d1031 100644
--- a/src/instructlab/training/tokenizer_utils.py
+++ b/src/instructlab/training/tokenizer_utils.py
@@ -19,8 +19,15 @@ class SpecialTokens:
 
 
 #TODO: Replace with specified template path
-from instructlab.training.chat_templates.ibm_generic_tmpl import SPECIAL_TOKENS, CHAT_TEMPLATE
-
+#from instructlab.training.chat_templates.ibm_generic_tmpl import SPECIAL_TOKENS, CHAT_TEMPLATE
+import importlib.util
+import sys
+spec = importlib.util.spec_from_file_location("ibm_generic_tmpl", "chat_templates/ibm_generic_tmpl.py")
+module = importlib.util.module_from_spec(spec)
+sys.modules["ibm_generic_tmpl"] = module
+spec.loader.exec_module(module)
+SPECIAL_TOKENS = module.SPECIAL_TOKENS
+CHAT_TEMPLATE = module.CHAT_TEMPLATE
 
 def setup_tokenizer(
     model_name_or_path, SPECIAL_TOKENS=SPECIAL_TOKENS, CHAT_TEMPLATE=CHAT_TEMPLATE

From 0a699546ee88e8db1d25f17b112a1f0e4cd7a776 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Thu, 20 Jun 2024 11:30:38 -0400
Subject: [PATCH 3/9] Fix handling of a missing system token

Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com>
---
 src/instructlab/training/data_process.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index 9ea51a38..2cac9c73 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -196,7 +196,10 @@ def main(args: DataProcessArgs):
 
     eos_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.eos)
     pad_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.pad)
-    system_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.system)
+    if SPECIAL_TOKENS.system:
+        system_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.system)
+    else:
+        system_tk = None
     user_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.user)
     assistant_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.assistant)
     log_rank_0(

From b1b11ebe136303a6f367f2ecef718482719e4ebd Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Thu, 20 Jun 2024 16:10:45 -0400
Subject: [PATCH 4/9] Connecting chat templates to the interface

Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com>
---
 .../chat_templates/ibm_generic_tmpl.py        |  5 ++--
 .../training/chat_templates/mistral_tmpl.py   |  3 +--
 src/instructlab/training/config.py            |  4 +++
 src/instructlab/training/data_process.py      | 21 ++++++++++-----
 src/instructlab/training/main_ds.py           | 11 +++++++-
 src/instructlab/training/tokenizer_utils.py   | 27 +++++--------------
 src/instructlab/training/utils.py             | 14 ++++++++++
 7 files changed, 53 insertions(+), 32 deletions(-)

diff --git a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py
index ba54e68d..6d4b37d7 100644
--- a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py
+++ b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py
@@ -1,3 +1,4 @@
+# Third Party
 from tokenizer_utils import SpecialTokens
 
 SPECIAL_TOKENS = SpecialTokens(
@@ -5,7 +6,7 @@
     user="<|user|>",
     assistant="<|assistant|>",
     eos="<|endoftext|>",
-    pad="<|pad|>"
+    pad="<|pad|>",
 )
 
 CHAT_TEMPLATE = (
@@ -20,4 +21,4 @@
     "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}"
     "{% endif %}"
     "{% endfor %}"
-)
\ No newline at end of file
+)
diff --git a/src/instructlab/training/chat_templates/mistral_tmpl.py b/src/instructlab/training/chat_templates/mistral_tmpl.py
index ba85e93d..753d5559 100644
--- a/src/instructlab/training/chat_templates/mistral_tmpl.py
+++ b/src/instructlab/training/chat_templates/mistral_tmpl.py
@@ -1,3 +1,4 @@
+# Third Party
 from tokenizer_utils import SpecialTokens
 
 SPECIAL_TOKENS = SpecialTokens(
@@ -5,8 +6,6 @@
     eos="</s>",
     user="[INST]",
     assistant="[/INST]",
-    
-
 )
 
 CHAT_TEMPLATE = (
diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py
index c8733a45..8b93485d 100644
--- a/src/instructlab/training/config.py
+++ b/src/instructlab/training/config.py
@@ -42,6 +42,7 @@ class DataProcessArgs(BaseModel):
     data_output_path: str
     max_seq_len: int  # defines the max sequence length of a sample
     model_path: str  # either a HF model name or path to HF model
+    chat_tmpl_path: str
 
     # disable the protected namespace for the model_config field
     model_config = ConfigDict(protected_namespaces=())
@@ -100,6 +101,9 @@ class TrainingArgs(BaseModel):
     # Either the name of a HuggingFace model or a path to a model saved in HuggingFace format.
     model_path: str
 
+    # Specify the chat template / special tokens for training (default is ibm-generic template/tokens)
+    chat_tmpl_path: str = __file__ + "/chat_templates/ibm_generic_tmpl.py"
+
     # this field specifies the filepath to the training dataset before processing
     data_path: str
     ckpt_output_dir: str
diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index 2cac9c73..8d70ff34 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -10,12 +10,8 @@
 
 # First Party
 from instructlab.training.config import DataProcessArgs
-from instructlab.training.tokenizer_utils import (
-    SPECIAL_TOKENS,
-    get_sp_token,
-    setup_tokenizer,
-)
-from instructlab.training.utils import log_rank_0, setup_logger
+from instructlab.training.tokenizer_utils import get_sp_token, setup_tokenizer
+from instructlab.training.utils import log_rank_0, retrieve_chat_template, setup_logger
 
 
 def check_valid_sample(
@@ -109,7 +105,11 @@ def unmask_only_assistant_responses(
     whole_sentence = chosen_token["input_ids"][:sentence_legth].clone()
 
     # pre-training mode
-    if not (system_tk in whole_sentence or user_token in whole_sentence or assist_token in whole_sentence):
+    if not (
+        system_tk in whole_sentence
+        or user_token in whole_sentence
+        or assist_token in whole_sentence
+    ):
         return labels
 
     labels[:sentence_legth] = -100
@@ -192,6 +192,7 @@ def remove_pretrain_system_messages(example: dict):
 
 
 def main(args: DataProcessArgs):
+    CHAT_TEMPLATE, SPECIAL_TOKENS = retrieve_chat_template(args.chat_tmpl_path)
     tokenizer = setup_tokenizer(args.model_path)
 
     eos_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.eos)
@@ -300,6 +301,12 @@ def main(args: DataProcessArgs):
     parser.add_argument(
         "--model_name_or_path", type=str, required=True, help="Model name or path"
     )
+    parser.add_argument(
+        "--chat-tmpl-path",
+        type=str,
+        default=f"{__file__}/chat_templates/ibm_generic_tmpl.py",
+        help="Path to desired chat template and special tokens, defaults to IBM generic.",
+    )
     args = parser.parse_args()
     setup_logger(args.logging_level)
     data_process_args = DataProcessArgs(
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index eeb0c077..08702549 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -37,6 +37,7 @@
     patch_target_module,
     prepare_peft_model,
     prepare_universal_checkpoint_from_latest,
+    retrieve_chat_template,
     save_hf_format_ds,
     save_model_ds_native,
     set_random_seed,
@@ -438,7 +439,8 @@ def main(args):
         print(f"\033[38;5;120m{yaml.dump(vars(args), sort_keys=False)}\033[0m")
 
     setup_logger(args.log_level)
-    tokenizer = setup_tokenizer(args.model_name_or_path)
+    CHAT_TEMPLATE, SPECIAL_TOKENS = retrieve_chat_template(args.chat_tmpl_path)
+    tokenizer = setup_tokenizer(args.model_name_or_path, CHAT_TEMPLATE, SPECIAL_TOKENS)
     # device = torch.device("cuda", args.local_rank)
 
     #### distributed init #####
@@ -522,6 +524,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs):
             model_path=train_args.model_path,
             data_path=train_args.data_path,
             max_seq_len=train_args.max_seq_len,
+            chat_tmpl_path=train_args.chat_tmpl_path,
         )
     )
 
@@ -546,6 +549,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs):
         f"--log_level=INFO",
         f"--max_batch_len={train_args.max_batch_len}",
         f"--seed={train_args.random_seed}",
+        f"--chat-tmpl-path={train_args.chat_tmpl_path}",
     ]
 
     if train_args.mock_data:
@@ -644,6 +648,11 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs):
         help="Offload optimizer to CPU when using DeepSpeed. This configures it to use ZeRO stage 2.",
     )
     parser.add_argument("--NEFTune_alpha", type=float, default=None)
+    parser.add_argument(
+        "--chat-tmpl-path",
+        type=str,
+        default=f"{__file__}/chat_templates/ibm_generic_tmpl.py",
+    )
     args = parser.parse_args()
     set_random_seed(args.seed)
     main(args)
diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py
index de6d1031..5c789441 100644
--- a/src/instructlab/training/tokenizer_utils.py
+++ b/src/instructlab/training/tokenizer_utils.py
@@ -18,26 +18,17 @@ class SpecialTokens:
     bos: str = field(default="<|begginingoftext|>")
 
 
-#TODO: Replace with specified template path
-#from instructlab.training.chat_templates.ibm_generic_tmpl import SPECIAL_TOKENS, CHAT_TEMPLATE
-import importlib.util
-import sys
-spec = importlib.util.spec_from_file_location("ibm_generic_tmpl", "chat_templates/ibm_generic_tmpl.py")
-module = importlib.util.module_from_spec(spec)
-sys.modules["ibm_generic_tmpl"] = module
-spec.loader.exec_module(module)
-SPECIAL_TOKENS = module.SPECIAL_TOKENS
-CHAT_TEMPLATE = module.CHAT_TEMPLATE
-
-def setup_tokenizer(
-    model_name_or_path, SPECIAL_TOKENS=SPECIAL_TOKENS, CHAT_TEMPLATE=CHAT_TEMPLATE
-):
+def setup_tokenizer(model_name_or_path, SPECIAL_TOKENS, CHAT_TEMPLATE):
     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, fast_tokenizer=True)
 
     if not SPECIAL_TOKENS.pad:
         SPECIAL_TOKENS.pad = SPECIAL_TOKENS.eos
     tokenizer.add_special_tokens(
-        {"bos_token": SPECIAL_TOKENS.bos, "eos_token": SPECIAL_TOKENS.eos, "pad_token": SPECIAL_TOKENS.pad}
+        {
+            "bos_token": SPECIAL_TOKENS.bos,
+            "eos_token": SPECIAL_TOKENS.eos,
+            "pad_token": SPECIAL_TOKENS.pad,
+        }
     )
 
     if SPECIAL_TOKENS.system:
@@ -46,11 +37,7 @@ def setup_tokenizer(
         add_token_list = []
     add_token_list.extend([SPECIAL_TOKENS.user, SPECIAL_TOKENS.assistant])
 
-    tokenizer.add_special_tokens(
-        {
-            "additional_special_tokens": add_token_list
-        }
-    )
+    tokenizer.add_special_tokens({"additional_special_tokens": add_token_list})
     if getattr(tokenizer, "add_bos_token", False) or getattr(
         tokenizer, "add_eos_token", False
     ):
diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py
index 6feaa548..ce935c41 100644
--- a/src/instructlab/training/utils.py
+++ b/src/instructlab/training/utils.py
@@ -24,6 +24,20 @@
 import torch.nn.functional as F
 
 
+def retrieve_chat_template(chat_tmpl_path):
+    # Standard
+    import importlib.util
+    import sys
+
+    spec = importlib.util.spec_from_file_location("spcl_chat_tmpl", chat_tmpl_path)
+    module = importlib.util.module_from_spec(spec)
+    sys.modules["spcl_chat_tmpl"] = module
+    spec.loader.exec_module(module)
+    SPECIAL_TOKENS = module.SPECIAL_TOKENS
+    CHAT_TEMPLATE = module.CHAT_TEMPLATE
+    return CHAT_TEMPLATE, SPECIAL_TOKENS
+
+
 def add_noisy_embeddings(model, noise_alpha=None):
     if not noise_alpha:
         return model

From fbb6af6cd3053fcda8ae95c1f42e6103ce5b397e Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Fri, 21 Jun 2024 11:12:16 -0400
Subject: [PATCH 5/9] Fix file-paths

Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com>
---
 src/instructlab/training/config.py       | 5 ++++-
 src/instructlab/training/data_process.py | 4 +++-
 src/instructlab/training/main_ds.py      | 2 +-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py
index 8b93485d..b989b18d 100644
--- a/src/instructlab/training/config.py
+++ b/src/instructlab/training/config.py
@@ -4,6 +4,7 @@
 
 # Standard
 from enum import Enum
+import os
 
 # Third Party
 from pydantic import BaseModel, ConfigDict, Field
@@ -102,7 +103,9 @@ class TrainingArgs(BaseModel):
     model_path: str
 
     # Specify the chat template / special tokens for training (default is ibm-generic template/tokens)
-    chat_tmpl_path: str = __file__ + "/chat_templates/ibm_generic_tmpl.py"
+    chat_tmpl_path: str = (
+        os.path.dirname(__file__) + "/chat_templates/ibm_generic_tmpl.py"
+    )
 
     # this field specifies the filepath to the training dataset before processing
     data_path: str
diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index 8d70ff34..d6c245f1 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 from typing import List
 import logging
+import os
 
 # Third Party
 from datasets import load_dataset
@@ -304,7 +305,7 @@ def main(args: DataProcessArgs):
     parser.add_argument(
         "--chat-tmpl-path",
         type=str,
-        default=f"{__file__}/chat_templates/ibm_generic_tmpl.py",
+        default=f"{os.path.dirname(__file__)}/../chat_templates/ibm_generic_tmpl.py",
         help="Path to desired chat template and special tokens, defaults to IBM generic.",
     )
     args = parser.parse_args()
@@ -314,6 +315,7 @@ def main(args: DataProcessArgs):
         data_path=args.data_path,
         max_seq_len=args.max_seq_len,
         model_path=args.model_name_or_path,
+        chat_tmpl_path=args.chat_tmpl_path,
     )
     main(data_process_args)
 
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 08702549..1dedf513 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -651,7 +651,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs):
     parser.add_argument(
         "--chat-tmpl-path",
         type=str,
-        default=f"{__file__}/chat_templates/ibm_generic_tmpl.py",
+        default=f"{os.path.dirname(__file__)}/chat_templates/ibm_generic_tmpl.py",
     )
     args = parser.parse_args()
     set_random_seed(args.seed)

From 07b6662d76b44877692fd9d8e73463401a005f4f Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Fri, 21 Jun 2024 11:18:46 -0400
Subject: [PATCH 6/9] Fix setup_tokenizer arguments and default template path

Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com>
---
 src/instructlab/training/data_process.py | 4 ++--
 src/instructlab/training/main_ds.py      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index d6c245f1..9bbe0a5f 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -194,7 +194,7 @@ def remove_pretrain_system_messages(example: dict):
 
 def main(args: DataProcessArgs):
     CHAT_TEMPLATE, SPECIAL_TOKENS = retrieve_chat_template(args.chat_tmpl_path)
-    tokenizer = setup_tokenizer(args.model_path)
+    tokenizer = setup_tokenizer(args.model_path, SPECIAL_TOKENS, CHAT_TEMPLATE)
 
     eos_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.eos)
     pad_tk = get_sp_token(tokenizer, SPECIAL_TOKENS.pad)
@@ -305,7 +305,7 @@ def main(args: DataProcessArgs):
     parser.add_argument(
         "--chat-tmpl-path",
         type=str,
-        default=f"{os.path.dirname(__file__)}/../chat_templates/ibm_generic_tmpl.py",
+        default=f"{os.path.dirname(__file__)}/chat_templates/ibm_generic_tmpl.py",
         help="Path to desired chat template and special tokens, defaults to IBM generic.",
     )
     args = parser.parse_args()
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 1dedf513..451413fc 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -440,7 +440,7 @@ def main(args):
 
     setup_logger(args.log_level)
     CHAT_TEMPLATE, SPECIAL_TOKENS = retrieve_chat_template(args.chat_tmpl_path)
-    tokenizer = setup_tokenizer(args.model_name_or_path, CHAT_TEMPLATE, SPECIAL_TOKENS)
+    tokenizer = setup_tokenizer(args.model_name_or_path, SPECIAL_TOKENS, CHAT_TEMPLATE)
     # device = torch.device("cuda", args.local_rank)
 
     #### distributed init #####

From 0db67f64595bb4e08bbaf8af1f3887088a957775 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Fri, 21 Jun 2024 11:55:13 -0400
Subject: [PATCH 7/9] Fixed template imports

Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com>
---
 .../training/chat_templates/ibm_generic_tmpl.py   |  4 ++--
 .../training/chat_templates/mistral_tmpl.py       |  4 ++--
 src/instructlab/training/utils.py                 | 15 +++++++++------
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py
index 6d4b37d7..87bfdb0a 100644
--- a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py
+++ b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py
@@ -1,5 +1,5 @@
-# Third Party
-from tokenizer_utils import SpecialTokens
+# First Party
+from instructlab.training.tokenizer_utils import SpecialTokens
 
 SPECIAL_TOKENS = SpecialTokens(
     system="<|system|>",
diff --git a/src/instructlab/training/chat_templates/mistral_tmpl.py b/src/instructlab/training/chat_templates/mistral_tmpl.py
index 753d5559..965823f2 100644
--- a/src/instructlab/training/chat_templates/mistral_tmpl.py
+++ b/src/instructlab/training/chat_templates/mistral_tmpl.py
@@ -1,5 +1,5 @@
-# Third Party
-from tokenizer_utils import SpecialTokens
+# First Party
+from instructlab.training.tokenizer_utils import SpecialTokens
 
 SPECIAL_TOKENS = SpecialTokens(
     bos="<s>",
diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py
index ce935c41..9e57dc68 100644
--- a/src/instructlab/training/utils.py
+++ b/src/instructlab/training/utils.py
@@ -29,12 +29,15 @@ def retrieve_chat_template(chat_tmpl_path):
     import importlib.util
     import sys
 
-    spec = importlib.util.spec_from_file_location("spcl_chat_tmpl", chat_tmpl_path)
-    module = importlib.util.module_from_spec(spec)
-    sys.modules["spcl_chat_tmpl"] = module
-    spec.loader.exec_module(module)
-    SPECIAL_TOKENS = module.SPECIAL_TOKENS
-    CHAT_TEMPLATE = module.CHAT_TEMPLATE
+    try:
+        spec = importlib.util.spec_from_file_location("spcl_chat_tmpl", chat_tmpl_path)
+        module = importlib.util.module_from_spec(spec)
+        sys.modules["spcl_chat_tmpl"] = module
+        spec.loader.exec_module(module)
+        SPECIAL_TOKENS = module.SPECIAL_TOKENS
+        CHAT_TEMPLATE = module.CHAT_TEMPLATE
+    except:
+        sys.exit(f"Invalid chat template path: {chat_tmpl_path}")
     return CHAT_TEMPLATE, SPECIAL_TOKENS
 
 

From 16b9e88f25c4063fb96023f67e27852e76bf93a3 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Fri, 21 Jun 2024 14:11:42 -0400
Subject: [PATCH 8/9] Review feedback applied

Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com>
---
 src/instructlab/training/config.py       | 4 ++--
 src/instructlab/training/data_process.py | 4 +++-
 src/instructlab/training/main_ds.py      | 4 +++-
 src/instructlab/training/utils.py        | 4 ----
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py
index b989b18d..7ea000b8 100644
--- a/src/instructlab/training/config.py
+++ b/src/instructlab/training/config.py
@@ -103,8 +103,8 @@ class TrainingArgs(BaseModel):
     model_path: str
 
     # Specify the chat template / special tokens for training (default is ibm-generic template/tokens)
-    chat_tmpl_path: str = (
-        os.path.dirname(__file__) + "/chat_templates/ibm_generic_tmpl.py"
+    chat_tmpl_path: str = os.path.join(
+        os.path.dirname(__file__), "/chat_templates/ibm_generic_tmpl.py"
     )
 
     # this field specifies the filepath to the training dataset before processing
diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index 9bbe0a5f..9301d185 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -305,7 +305,9 @@ def main(args: DataProcessArgs):
     parser.add_argument(
         "--chat-tmpl-path",
         type=str,
-        default=f"{os.path.dirname(__file__)}/chat_templates/ibm_generic_tmpl.py",
+        default=os.path.join(
+            os.path.dirname(__file__), "chat_templates/ibm_generic_tmpl.py"
+        ),
         help="Path to desired chat template and special tokens, defaults to IBM generic.",
     )
     args = parser.parse_args()
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 451413fc..1eb21a68 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -651,7 +651,9 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs):
     parser.add_argument(
         "--chat-tmpl-path",
         type=str,
-        default=f"{os.path.dirname(__file__)}/chat_templates/ibm_generic_tmpl.py",
+        default=os.path.join(
+            os.path.dirname(__file__), "chat_templates/ibm_generic_tmpl.py"
+        ),
     )
     args = parser.parse_args()
     set_random_seed(args.seed)
diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py
index 9e57dc68..5eccb5dc 100644
--- a/src/instructlab/training/utils.py
+++ b/src/instructlab/training/utils.py
@@ -25,10 +25,6 @@
 
 
 def retrieve_chat_template(chat_tmpl_path):
-    # Standard
-    import importlib.util
-    import sys
-
     try:
         spec = importlib.util.spec_from_file_location("spcl_chat_tmpl", chat_tmpl_path)
         module = importlib.util.module_from_spec(spec)

From d0dcef053307720bf85a7f89b6ee762cf7c76353 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz <meyceoz@redhat.com>
Date: Fri, 21 Jun 2024 14:21:04 -0400
Subject: [PATCH 9/9] Remove extraneous slash

Signed-off-by: Mustafa Eyceoz <meyceoz@redhat.com>
---
 src/instructlab/training/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py
index 7ea000b8..83c7a1f8 100644
--- a/src/instructlab/training/config.py
+++ b/src/instructlab/training/config.py
@@ -104,7 +104,7 @@ class TrainingArgs(BaseModel):
 
     # Specify the chat template / special tokens for training (default is ibm-generic template/tokens)
     chat_tmpl_path: str = os.path.join(
-        os.path.dirname(__file__), "/chat_templates/ibm_generic_tmpl.py"
+        os.path.dirname(__file__), "chat_templates/ibm_generic_tmpl.py"
     )
 
     # this field specifies the filepath to the training dataset before processing