Add starcoder and starchat ggml support #2364

Closed

10 changes: 9 additions & 1 deletion README.md
@@ -1,6 +1,6 @@
# Text generation web UI

A gradio web UI for running Large Language Models like LLaMA, llama.cpp, GPT-J, Pythia, OPT, and GALACTICA.
A gradio web UI for running Large Language Models like LLaMA, llama.cpp, GPT-J, Pythia, OPT, StarCoder, and GALACTICA.

Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation.

@@ -26,6 +26,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
* [LoRA (loading and training)](docs/Using-LoRAs.md)
* [llama.cpp](docs/llama.cpp-models.md)
* [RWKV model](docs/RWKV-model.md)
* [starcoder.cpp](docs/starcoder.cpp-models.md)
* 8-bit and 4-bit through bitsandbytes
* Layers splitting across GPU(s), CPU, and disk
* CPU mode
@@ -252,6 +253,13 @@ Optionally, you can use the following command-line flags:
| `--n_ctx N_CTX` | Size of the prompt context. |
| `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). |

#### starcoder.cpp

| Flag | Description |
|-------------|-------------|
| `--threads` | Number of threads to use. |
| `--n_ctx N_CTX` | Size of the prompt context. |
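
For example, with a hypothetical model file name: `python server.py --model starcoder-ggml-q4_0.bin --threads 8`.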

#### GPTQ

| Flag | Description |
26 changes: 26 additions & 0 deletions docs/starcoder.cpp-models.md
@@ -0,0 +1,26 @@
# Using starcoder.cpp in the web UI

## Setting up the models

#### Pre-converted

Place the model in the `models` folder, making sure that its name
starts with `starcoder` or `starchat`, contains `ggml` somewhere in
the middle, and ends in `.bin`.
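
For example, a file placed at `models/starchat-alpha-ggml-q4_0.bin`
(hypothetical file name) satisfies this convention and will be
detected automatically.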

You can find converted models here:

- [StarChat Alpha](https://huggingface.co/NeoDim/starchat-alpha-GGML)
- [StarCoder](https://huggingface.co/NeoDim/starcoder-GGML)
- [StarCoderBase](https://huggingface.co/NeoDim/starcoderbase-GGML)

#### Convert models yourself

Follow the instructions
[here](https://github.com/ggerganov/ggml/tree/master/examples/starcoder).

There is also
[starcoder.cpp](https://github.com/bigcode-project/starcoder.cpp#quantizing-the-models),
but it has a known [issue](https://github.com/bigcode-project/starcoder.cpp/issues/11).
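
As a quick sanity check outside the web UI, a converted file can be loaded
directly with [ctransformers](https://github.com/marella/ctransformers), the
library the web UI uses for these models. The snippet below is a minimal
sketch: the file name is hypothetical, and it relies on the same `_stream()`
helper that `modules/starcoder_model.py` calls.

```python
# Minimal sketch: load a converted starcoder/starchat GGML file with ctransformers
# and stream a short completion. The file name below is hypothetical.
from ctransformers import AutoConfig, AutoModelForCausalLM

model_path = "models/starcoder-ggml-q4_0.bin"  # adjust to your converted model

config = AutoConfig.from_pretrained(
    model_path,
    stop=["<|end|>"],  # stop sequence used for starchat-style prompts
    threads=8,
)
model = AutoModelForCausalLM.from_pretrained(
    model_path, model_type="starcoder", config=config
)

# _stream() is the same generator that modules/starcoder_model.py relies on.
for token in model._stream(prompt="def fibonacci(n):", max_new_tokens=64, threads=8):
    print(token, end="", flush=True)
print()
```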


44 changes: 44 additions & 0 deletions modules/models.py
@@ -47,6 +47,14 @@ def find_model_type(model_name):
    model_name_lower = model_name.lower()
    if re.match('.*rwkv.*\.pth', model_name_lower):
        return 'rwkv'
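    # The starcoder/starchat checks must come before the generic '*ggml*.bin'
    # llama.cpp check below, otherwise those files would be detected as llamacpp.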
    elif len(list(path_to_model.glob('*starcoder*ggml*.bin'))) > 0:
        return 'starcoder'
    elif re.match('.*starcoder.*ggml.*\.bin', model_name_lower):
        return 'starcoder'
    elif len(list(path_to_model.glob('*starchat*ggml*.bin'))) > 0:
        return 'starchat'
    elif re.match('.*starchat.*ggml.*\.bin', model_name_lower):
        return 'starchat'
    elif len(list(path_to_model.glob('*ggml*.bin'))) > 0:
        return 'llamacpp'
    elif re.match('.*ggml.*\.bin', model_name_lower):
@@ -83,6 +91,10 @@ def load_model(model_name):
        load_func = AutoGPTQ_loader
    elif shared.args.wbits > 0:
        load_func = GPTQ_loader
    elif shared.model_type == 'starcoder':
        load_func = starcodercpp_loader
    elif shared.model_type == 'starchat':
        load_func = starchatcpp_loader
    elif shared.model_type == 'llamacpp':
        load_func = llamacpp_loader
    elif shared.model_type == 'rwkv':
@@ -273,6 +285,38 @@ def llamacpp_loader(model_name):
    return model, tokenizer


def starcodercpp_loader(model_name):
    from modules.starcoder_model import StarcoderCppModel

    path = Path(f'{shared.args.model_dir}/{model_name}')
    if path.is_file():
        model_file = path
    else:
        model_file = list(
            Path(f'{shared.args.model_dir}/{model_name}').glob('*starcoder*ggml*.bin')
        )[0]

    logger.info(f'starcoder.cpp weights detected: {model_file}\n')
    model, tokenizer = StarcoderCppModel().from_pretrained(model_file)
    return model, tokenizer


def starchatcpp_loader(model_name):
    from modules.starcoder_model import StarcoderCppModel

    path = Path(f'{shared.args.model_dir}/{model_name}')
    if path.is_file():
        model_file = path
    else:
        model_file = list(
            Path(f'{shared.args.model_dir}/{model_name}').glob('*starchat*ggml*.bin')
        )[0]

    logger.info(f'starchat.cpp weights detected: {model_file}\n')
    model, tokenizer = StarcoderCppModel().from_pretrained(model_file)
    return model, tokenizer


def GPTQ_loader(model_name):

    # Monkey patch
4 changes: 2 additions & 2 deletions modules/shared.py
@@ -38,7 +38,7 @@
    'autoload_model': True,
    'max_new_tokens': 200,
    'max_new_tokens_min': 1,
    'max_new_tokens_max': 2000,
    'max_new_tokens_max': 8000,
    'seed': -1,
    'character': 'None',
    'name1': 'You',
@@ -60,7 +60,7 @@
    'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
    'chat_prompt_size': 2048,
    'chat_prompt_size_min': 0,
    'chat_prompt_size_max': 2048,
    'chat_prompt_size_max': 8192,
    'chat_generation_attempts': 1,
    'chat_generation_attempts_min': 1,
    'chat_generation_attempts_max': 10,
81 changes: 81 additions & 0 deletions modules/starcoder_model.py
@@ -0,0 +1,81 @@
from ctransformers import AutoModelForCausalLM
from ctransformers import AutoConfig

from modules import shared


class StarcoderCppModel:
    def __init__(self):
        pass

    @classmethod
    def from_pretrained(self, path):
        result = self()

        config = AutoConfig.from_pretrained(
            str(path),
            stop=["<|end|>"],
            threads=shared.args.threads,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            str(path), model_type="starcoder", config=config
        )
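        # The same object is returned twice: it serves as both model and tokenizer.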
        return result, result

    def encode(self, string, **kwargs):
        return self.model.tokenize(string)

    def decode(self, ids):
        return self.model.detokenize(ids)

    def generate(
        self,
        context="",
        token_count=20,
        temperature=1,
        top_p=1,
        top_k=50,
        repetition_penalty=1,
        callback=None,
    ):
        context = context if type(context) is str else context.decode()
        generator = self.model._stream(
            prompt=context,
            max_new_tokens=token_count,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            threads=shared.args.threads,
        )
        out = ""
        for token in generator:
            if callback:
                callback(token)
            out += token
        return out

    def generate_with_streaming(
        self,
        context="",
        token_count=20,
        temperature=1,
        top_p=1,
        top_k=50,
        repetition_penalty=1,
        callback=None,
    ):
        context = context if type(context) is str else context.decode()
        generator = self.model._stream(
            prompt=context,
            max_new_tokens=token_count,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            threads=shared.args.threads,
        )
        reply = ""
        for token in generator:
            reply += token
            yield reply
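
For reference, here is a minimal sketch of how the wrapper above is consumed. It assumes the script is run from the repository root (so `modules` is importable), a hypothetical model file name, and that the web UI's custom generation path, which is not shown in this diff, drives `generate_with_streaming()`.

```python
# Sketch (not part of the PR): exercising StarcoderCppModel directly.
from modules.starcoder_model import StarcoderCppModel

# from_pretrained() returns the same object twice; the loaders in modules/models.py
# pass it around as both model and tokenizer.
model, tokenizer = StarcoderCppModel().from_pretrained(
    "models/starcoder-ggml-q4_0.bin"  # hypothetical file name
)

prompt = "def hello_world():"
input_ids = tokenizer.encode(prompt)   # token ids via ctransformers' tokenize()
print(tokenizer.decode(input_ids))     # round-trips back to the prompt text

# Streaming interface: each iteration yields the reply accumulated so far.
reply = ""
for reply in model.generate_with_streaming(context=prompt, token_count=64):
    pass
print(reply)
```

The blocking `generate()` method takes the same parameters and returns the full completion in one call.
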
9 changes: 6 additions & 3 deletions modules/text_generation.py
@@ -36,7 +36,7 @@ def get_max_prompt_length(state):


def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
    if shared.model_type in ['rwkv', 'llamacpp']:
    if shared.model_type in ['rwkv', 'llamacpp', 'starcoder', 'starchat']:
        input_ids = shared.tokenizer.encode(str(prompt))
        input_ids = np.array(input_ids).reshape(1, len(input_ids))
        return input_ids
@@ -56,7 +56,10 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
    if truncation_length is not None:
        input_ids = input_ids[:, -truncation_length:]

    if shared.model_type in ['rwkv', 'llamacpp'] or shared.args.cpu:
    if (
        shared.model_type in ['rwkv', 'llamacpp', 'starcoder', 'starchat']
        or shared.args.cpu
    ):
        return input_ids
    elif shared.args.flexgen:
        return input_ids.numpy()
@@ -170,7 +173,7 @@ def _generate_reply(question, state, eos_token=None, stopping_strings=None, is_c
        yield question
        return

    if shared.model_type in ['rwkv', 'llamacpp']:
    if shared.model_type in ['rwkv', 'llamacpp', 'starcoder', 'starchat']:
        generate_func = generate_reply_custom
    elif shared.args.flexgen:
        generate_func = generate_reply_flexgen
1 change: 1 addition & 0 deletions requirements.txt
@@ -20,3 +20,4 @@ bitsandbytes==0.39.0; platform_system != "Windows"
https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.39.0-py3-none-any.whl; platform_system == "Windows"
llama-cpp-python==0.1.53; platform_system != "Windows"
https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.53/llama_cpp_python-0.1.53-cp310-cp310-win_amd64.whl; platform_system == "Windows"
ctransformers==0.2.1