From 5ceac52ab40ad7f5905d9df6e5decd8405742bc4 Mon Sep 17 00:00:00 2001
From: cal066
Date: Wed, 26 Jul 2023 05:00:06 +0000
Subject: [PATCH] ctransformers: another attempt

Generalized ctransformers support, based on:
https://github.com/oobabooga/text-generation-webui/pull/2892

Credits to randoentity
---
 modules/ctransformers_model.py | 76 ++++++++++++++++++++++++++++++++++
 modules/loaders.py             | 41 ++++++++++++++++++
 modules/models.py              | 22 +++++++++-
 modules/shared.py              |  2 +-
 modules/text_generation.py     | 16 ++++---
 requirements.txt               |  1 +
 server.py                      |  5 ++-
 7 files changed, 154 insertions(+), 9 deletions(-)
 create mode 100644 modules/ctransformers_model.py

diff --git a/modules/ctransformers_model.py b/modules/ctransformers_model.py
new file mode 100644
index 0000000000..7bce57d9fa
--- /dev/null
+++ b/modules/ctransformers_model.py
@@ -0,0 +1,76 @@
+from ctransformers import AutoModelForCausalLM
+from ctransformers import AutoConfig
+
+from modules import shared
+from modules.callbacks import Iteratorize
+from modules.logging_colors import logger
+
+class CtransformersModel:
+    def __init__(self):
+        pass
+
+    @classmethod
+    def from_pretrained(cls, path):
+        result = cls()
+        stops = list(shared.settings['custom_stopping_strings'])
+        stops.append("<|end|>")
+
+        # ctransformers uses -1 for random seed
+        config = AutoConfig.from_pretrained(
+            str(path),
+            stop=stops,
+            threads=shared.args.threads,
+            gpu_layers=shared.args.n_gpu_layers,
+            batch_size=shared.args.n_batch,
+            stream=not shared.args.no_stream,
+            seed=(-1 if shared.args.llama_cpp_seed == 0 else shared.args.llama_cpp_seed)
+        )
+        result.model = AutoModelForCausalLM.from_pretrained(
+            str(result.model_dir(path) if result.model_type_is_auto() else path),
+            model_type=(None if result.model_type_is_auto() else shared.args.model_type),
+            config=config
+        )
+        logger.info(f'Using ctransformers model_type: {result.model.model_type} for {result.model.model_path}')
+        return result, result
+
+    def model_type_is_auto(self):
+        return shared.args.model_type == "Auto" or shared.args.model_type == "None"
+
+    def model_dir(self, path):
+        if path.is_file():
+            return path.parent
+        return path
+
+    def encode(self, string, **kwargs):
+        return self.model.tokenize(string)
+
+    def decode(self, ids):
+        return self.model.detokenize(ids)
+
+
+    def generate(self, prompt, state, callback=None):
+        prompt = prompt if type(prompt) is str else prompt.decode()
+        generator = self.model._stream(
+            prompt=prompt,
+            max_new_tokens=state['max_new_tokens'],
+            temperature=state['temperature'],
+            top_p=state['top_p'],
+            top_k=state['top_k'],
+            repetition_penalty=state['repetition_penalty'],
+            threads=shared.args.threads
+        )
+
+        output = ""
+        for token in generator:
+            if callback:
+                callback(token)
+            output += token
+        return output
+
+
+    def generate_with_streaming(self, *args, **kwargs):
+        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
+            reply = ''
+            for token in generator:
+                reply += token
+                yield reply
diff --git a/modules/loaders.py b/modules/loaders.py
index 68b48204ed..6fabfa5d26 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -86,6 +86,16 @@
         'compress_pos_emb',
         'alpha_value',
         'exllama_HF_info',
+    ],
+    'ctransformers': [
+        'n_ctx',
+        'n_gpu_layers',
+        'n_batch',
+        'threads',
+        'no_mmap',
+        'mlock',
+        'model_type',
+        'llama_cpp_seed',
     ]
 }
 
@@ -244,6 +254,13 @@
         'skip_special_tokens',
         'auto_max_new_tokens',
     },
+    'ctransformers': {
+        'temperature',
+        'top_p',
+        'top_k',
+        'repetition_penalty',
+        'seed'
+    }
 }
 
 
@@ -264,6 +281,30 @@ def blacklist_samplers(loader):
     else:
         return [gr.update(visible=True) if sampler in loaders_samplers[loader] else gr.update(visible=False) for sampler in all_samplers]
+model_loader_type_table = {
+    'GPTQ-for-LLaMa': [
+        "None",
+        "llama",
+        "opt",
+        "gptj"
+    ],
+    'ctransformers': [
+        "None",
+        "gptj",
+        "gpt_neox",
+        "llama",
+        "mpt",
+        "dolly-v2",
+        "replit",
+        "starcoder",
+        "falcon"
+    ],
+}
+
+def model_loader_type(loader):
+    if loader in model_loader_type_table:
+        return model_loader_type_table[loader]
+    return ["None"]
 
 
 def get_gpu_memory_keys():
     return [k for k in shared.gradio if k.startswith('gpu_memory')]
diff --git a/modules/models.py b/modules/models.py
index 4866893afa..15070a9b21 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -58,7 +58,8 @@ def load_model(model_name, loader=None):
         'llamacpp_HF': llamacpp_HF_loader,
         'RWKV': RWKV_loader,
         'ExLlama': ExLlama_loader,
-        'ExLlama_HF': ExLlama_HF_loader
+        'ExLlama_HF': ExLlama_HF_loader,
+        'ctransformers': ctransformers_loader,
     }
 
     p = Path(model_name)
@@ -263,6 +264,25 @@ def llamacpp_HF_loader(model_name):
     return model, tokenizer
 
+def ctransformers_loader(model_name):
+    from modules.ctransformers_model import CtransformersModel
+
+    path = Path(f'{shared.args.model_dir}/{model_name}')
+    logger.info(f'ctransformers loading: {path}\n')
+    ctrans = CtransformersModel()
+    if ctrans.model_type_is_auto():
+        model_file = path
+    else:
+        if path.is_file():
+            model_file = path
+        else:
+            model_file = list(
+                Path(f'{shared.args.model_dir}/{model_name}').glob('*.bin')
+            )[0]
+    logger.info(f'ctransformers weights detected: {model_file}\n')
+    model, tokenizer = ctrans.from_pretrained(model_file)
+    return model, tokenizer
+
 
 def GPTQ_loader(model_name):
 
     # Monkey patch
diff --git a/modules/shared.py b/modules/shared.py
index a2782e6573..afc0c6c38f 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -35,7 +35,7 @@
     'autoload_model': False,
     'max_new_tokens': 200,
     'max_new_tokens_min': 1,
-    'max_new_tokens_max': 4096,
+    'max_new_tokens_max': 8000,
     'auto_max_new_tokens': False,
     'seed': -1,
     'character': 'None',
diff --git a/modules/text_generation.py b/modules/text_generation.py
index f6f71990ab..813139ad31 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -34,9 +34,10 @@ def generate_reply(*args, **kwargs):
 def get_max_prompt_length(state):
     return state['truncation_length'] - state['max_new_tokens']
 
-
+encode_llama_prompts = ['LlamaCppModel', 'RWKVModel', 'CtransformersModel']
+encode_llama_truncation = ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'CtransformersModel']
 def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel']:
+    if shared.model.__class__.__name__ in encode_llama_prompts:
         input_ids = shared.tokenizer.encode(str(prompt))
         input_ids = np.array(input_ids).reshape(1, len(input_ids))
         return input_ids
@@ -51,7 +52,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if truncation_length is not None:
         input_ids = input_ids[:, -truncation_length:]
 
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel'] or shared.args.cpu:
+    if shared.model.__class__.__name__ in encode_llama_truncation or shared.args.cpu:
         return input_ids
     elif shared.args.deepspeed:
         return input_ids.to(device=local_rank)
@@ -169,7 +170,12 @@ def apply_stopping_strings(reply, all_stop_strings):
 
     return reply, stop_found
 
-
+_generate_reply_use_custom = [
+    'LlamaCppModel',
+    'RWKVModel',
+    'ExllamaModel',
+    'CtransformersModel'
+]
 def _generate_reply(question, state, stopping_strings=None, is_chat=False):
     generate_func = apply_extensions('custom_generate_reply')
     if generate_func is None:
@@ -178,7 +184,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False):
         yield ''
         return
 
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel']:
+    if shared.model.__class__.__name__ in _generate_reply_use_custom:
         generate_func = generate_reply_custom
     else:
         generate_func = generate_reply_HF
diff --git a/requirements.txt b/requirements.txt
index 9486f808d3..fff7031738 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,3 +31,4 @@ https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.77/llama_cpp_
 # llama-cpp-python with CUDA support
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.77+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.77+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.16+cu117-py3-none-any.whl
diff --git a/server.py b/server.py
index d622cdbe85..73adb571aa 100644
--- a/server.py
+++ b/server.py
@@ -204,7 +204,7 @@ def create_model_menus():
 
     with gr.Row():
         with gr.Column():
-            shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "ExLlama_HF", "ExLlama", "AutoGPTQ", "GPTQ-for-LLaMa", "llama.cpp", "llamacpp_HF"], value=None)
+            shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None)
             with gr.Box():
                 with gr.Row():
                     with gr.Column():
@@ -225,7 +225,7 @@ def create_model_menus():
                         shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None")
                         shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None")
-                        shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None", "llama", "opt", "gptj"], value=shared.args.model_type or "None")
+                        shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None")
                         shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
                         shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.')
                         shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
@@ -267,6 +267,7 @@ def create_model_menus():
             shared.gradio['model_status'] = gr.Markdown('No model is loaded' if shared.model_name == 'None' else 'Ready')
 
     shared.gradio['loader'].change(loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params()))
+    shared.gradio['loader'].change(fn=lambda value: gr.update(choices=loaders.model_loader_type(value)), inputs=shared.gradio['loader'], outputs=shared.gradio['model_type'])
 
     # In this event handler, the interface state is read and updated
     # with the model defaults (if any), and then the model is loaded
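
Reviewer note, not part of the patch: the new module is a thin wrapper around the
public ctransformers API that the patch itself calls (AutoConfig and
AutoModelForCausalLM). A minimal standalone sketch of the same load/generate flow
is shown below; the weights path, thread count, and sampling values are
placeholders, not anything shipped with this change.

    # pip install ctransformers
    from ctransformers import AutoConfig, AutoModelForCausalLM

    model_path = "models/llama-7b.ggmlv3.q4_0.bin"  # placeholder GGML weights path

    # Mirrors CtransformersModel.from_pretrained(): build a config, then load.
    config = AutoConfig.from_pretrained(
        model_path,
        stop=["<|end|>"],
        threads=8,
        gpu_layers=0,
        stream=True,
        seed=-1,  # -1 = random seed, as noted in the module above
    )
    llm = AutoModelForCausalLM.from_pretrained(
        model_path,
        model_type="llama",  # or None to let ctransformers autodetect
        config=config,
    )

    # Mirrors generate()/generate_with_streaming(): with stream enabled the call
    # yields text pieces one at a time instead of returning a single string.
    output = ""
    for token in llm("Hello, my name is", max_new_tokens=64, temperature=0.7):
        output += token
    print(output)

Inside the web UI the same path is exercised by picking the "ctransformers" loader
and a model_type from the dropdown wiring added to server.py.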