From aa0a35ea728502216a6a33d5ba3d8d7e8c8c83d2 Mon Sep 17 00:00:00 2001 From: Silver Date: Thu, 14 Mar 2024 21:34:40 +0800 Subject: [PATCH] Check if the given token is a string (#745) Some models use bytes as their tokens, such as Qwen (see: https://huggingface.co/Qwen/Qwen-7B/blob/ef3c5c9c57b252f3149c1408daf4d649ec8b6c85/tokenization_qwen.py#L136 ) --- outlines/integrations/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/outlines/integrations/utils.py b/outlines/integrations/utils.py index 3c92428c8..9ac4e2a4f 100644 --- a/outlines/integrations/utils.py +++ b/outlines/integrations/utils.py @@ -52,11 +52,15 @@ def adapt_tokenizer(tokenizer: PreTrainedTokenizerBase) -> PreTrainedTokenizerBa tokenizer.vocabulary = tokenizer.get_vocab() tokenizer.special_tokens = set(tokenizer.all_special_tokens) - def convert_token_to_string(token: str) -> str: + def convert_token_to_string(token: Union[str, bytes]) -> str: string = tokenizer.convert_tokens_to_string([token]) # A hack to handle missing spaces to HF's Llama tokenizers - if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>": + if ( + type(token) is str + and token.startswith(SPIECE_UNDERLINE) + or token == "<0x20>" + ): return " " + string return string