From aa0a35ea728502216a6a33d5ba3d8d7e8c8c83d2 Mon Sep 17 00:00:00 2001
From: Silver <zhengyinhe1@163.com>
Date: Thu, 14 Mar 2024 21:34:40 +0800
Subject: [PATCH] Check if the given token is a string (#745)

Some models use bytes as their tokens, such as Qwen (see:
https://huggingface.co/Qwen/Qwen-7B/blob/ef3c5c9c57b252f3149c1408daf4d649ec8b6c85/tokenization_qwen.py#L136
)
---
 outlines/integrations/utils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/outlines/integrations/utils.py b/outlines/integrations/utils.py
index 3c92428c8..9ac4e2a4f 100644
--- a/outlines/integrations/utils.py
+++ b/outlines/integrations/utils.py
@@ -52,11 +52,15 @@ def adapt_tokenizer(tokenizer: PreTrainedTokenizerBase) -> PreTrainedTokenizerBa
     tokenizer.vocabulary = tokenizer.get_vocab()
     tokenizer.special_tokens = set(tokenizer.all_special_tokens)
 
-    def convert_token_to_string(token: str) -> str:
+    def convert_token_to_string(token: Union[str, bytes]) -> str:
         string = tokenizer.convert_tokens_to_string([token])
 
         # A hack to handle missing spaces to HF's Llama tokenizers
-        if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
+        if (
+            type(token) is str
+            and token.startswith(SPIECE_UNDERLINE)
+            or token == "<0x20>"
+        ):
             return " " + string
 
         return string