diff --git a/cht-llama-cpp/build-aarch64-apple-darwin.sh b/cht-llama-cpp/build-aarch64-apple-darwin.sh
index 121c777..7b226a9 100755
--- a/cht-llama-cpp/build-aarch64-apple-darwin.sh
+++ b/cht-llama-cpp/build-aarch64-apple-darwin.sh
@@ -1,16 +1,17 @@
 #!/usr/bin/env bash
 set -e
-export VERSION=1.1.2
+export VERSION=1.2.0
 test -f venv/bin/activate || python -m venv venv
 source venv/bin/activate
-pip install -r requirements.txt pyinstaller
+FORCE_CMAKE=1 CMAKE_ARGS="-DLLAMA_METAL=on" pip install -r requirements.txt pyinstaller
 LLAMA_CPP_PATH=$(python -c 'import llama_cpp; print(llama_cpp.__path__[0])')
 NAME=cht-llama-cpp-mistral-${VERSION}-aarch64-apple-darwin
 pyinstaller --onefile \
     --target-arch arm64 \
     --add-binary "$LLAMA_CPP_PATH/libllama.dylib:llama_cpp" \
+    --add-binary "$LLAMA_CPP_PATH/ggml-metal.metal:llama_cpp" \
     --name=$NAME \
     main.py
 cp dist/$NAME dist/cht-llama-cpp-mistral-${VERSION%%.*}-aarch64-apple-darwin
diff --git a/cht-llama-cpp/build.sh b/cht-llama-cpp/build.sh
index 8dd8420..6607fc1 100755
--- a/cht-llama-cpp/build.sh
+++ b/cht-llama-cpp/build.sh
@@ -1,7 +1,8 @@
 #!/bin/bash
 set -e
-export VERSION=1.1.2
+export VERSION=1.2.0
 source "$(dirname "${BASH_SOURCE[0]}")/../utils.sh"
 build_cpu ghcr.io/premai-io/chat-mistral-7b-instruct-q5 mistral-7b-instruct-v0.1.Q5_0 --build-arg="MODEL_ID=mistral-7b-instruct-v0.1.Q5_0" --build-arg="MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q5_0.gguf" ${@:1}
 build_cpu ghcr.io/premai-io/chat-mistral-7b-openorca-q5 mistral-7b-openorca.Q5_K_S --build-arg="MODEL_ID=mistral-7b-openorca.Q5_K_S" --build-arg="MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF/resolve/main/mistral-7b-openorca.Q5_K_S.gguf" ${@:1}
+build_cpu ghcr.io/premai-io/chat-mistral-7b-yarn-q4 yarn-mistral-7b-128k.Q4_K_M --build-arg="MODEL_ID=yarn-mistral-7b-128k.Q4_K_M" --build-arg="MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Yarn-Mistral-7B-128k-GGUF/resolve/main/yarn-mistral-7b-128k.Q4_K_M.gguf" ${@:1}
diff --git a/cht-llama-cpp/docker/cpu/Dockerfile b/cht-llama-cpp/docker/cpu/Dockerfile
index 8344fde..74d0385 100644
--- a/cht-llama-cpp/docker/cpu/Dockerfile
+++ b/cht-llama-cpp/docker/cpu/Dockerfile
@@ -15,4 +15,5 @@
 RUN wget -O ./ml/models/${MODEL_ID}.gguf ${MODEL_DOWNLOAD_URL}
 COPY . .
 ENV MODEL_ID=$MODEL_ID
+ENV DEVICE=cpu
 CMD python main.py
diff --git a/cht-llama-cpp/main.py b/cht-llama-cpp/main.py
index 6a5d41b..72ababe 100644
--- a/cht-llama-cpp/main.py
+++ b/cht-llama-cpp/main.py
@@ -13,17 +13,18 @@
 MODEL_PATH = f"./ml/models/{os.getenv('MODEL_ID', 'yarn-mistral-7b-128k.Q4_K_M')}.gguf"
 # Mistral gguf follows ChatML syntax
 # https://github.com/openai/openai-python/blob/main/chatml.md
-
+PROMPT_TEMPLATE_STRING = '{"system_prompt_template": "<|im_start|>system\\n{}\\n<|im_end|>\\n", "default_system_text": "You are a helpful AI assistant.", "user_prompt_template": "<|im_start|>user\\n{}\\n<|im_end|>\\n", "assistant_prompt_template": "<|im_start|>assistant\\n{}\\n<|im_end|>\\n", "request_assistant_response_token": "<|im_start|>assistant\\n", "template_format": "chatml"}'  # noqa
+MODEL_CTX = 4096
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--model-path", help="Path to GGUF", default=MODEL_PATH)
     parser.add_argument("--port", help="Port to run model server on", type=int, default=8000)
-    parser.add_argument("--ctx", help="Context dimension", type=int, default=4096)
+    parser.add_argument("--ctx", help="Context dimension", type=int, default=MODEL_CTX)
     parser.add_argument(
         "--prompt_template",
         help="Prompt Template",
         type=str,
-        default='{"system_prompt_template": "<|im_start|>system\\n{}\\n<|im_end|>\\n", "default_system_text": "You are an helpful AI assistant.", "user_prompt_template": "<|im_start|>user\\n{}\\n<|im_end|>\\n", "assistant_prompt_template": "<|im_start|>assistant\\n{}\\n<|im_end|>\\n", "request_assistant_response_token": "<|im_start|>assistant\\n", "template_format": "chatml"}',  # noqa
+        default=PROMPT_TEMPLATE_STRING,  # noqa
     )  # noqa
     args = parser.parse_args()
     MODEL_PATH = args.model_path
diff --git a/cht-llama-cpp/models.py b/cht-llama-cpp/models.py
index af4b944..6f34a64 100644
--- a/cht-llama-cpp/models.py
+++ b/cht-llama-cpp/models.py
@@ -1,5 +1,6 @@
 import json
 import multiprocessing
+import os
 from typing import Any, Dict, List
 
 from llama_cpp import Llama, llama_chat_format, llama_types
@@ -66,7 +67,8 @@ def get_model(cls, model_path, prompt_template_jsonstr, n_ctx):
         cls.PROMPT_TEMPLATE = json.loads(prompt_template_jsonstr)
         chat_format = cls.PROMPT_TEMPLATE.get("template_format", "chatml")
         if cls.model is None:
-            cls.model = Llama(model_path, chat_format=chat_format, n_ctx=n_ctx)
+            gpu_offload_layers = -1 if os.getenv("DEVICE") != "cpu" else 0
+            cls.model = Llama(model_path, chat_format=chat_format, n_ctx=n_ctx, n_gpu_layers=gpu_offload_layers)
         return cls.model
diff --git a/cht-llama-cpp/requirements.txt b/cht-llama-cpp/requirements.txt
index cf7f103..38f208f 100644
--- a/cht-llama-cpp/requirements.txt
+++ b/cht-llama-cpp/requirements.txt
@@ -6,4 +6,4 @@ tqdm==4.65.0
 httpx==0.23.3
 python-dotenv==1.0.0
 tenacity==8.2.2
-llama-cpp-python==0.2.14
+llama-cpp-python==0.2.18
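
A minimal smoke-test sketch of the new GPU/CPU switch; the binary name, model path, and port below are assumed from the defaults introduced above, not part of the change itself:

# Build the arm64 macOS binary with Metal compiled in (build-aarch64-apple-darwin.sh above).
./build-aarch64-apple-darwin.sh

# With DEVICE unset, models.py passes n_gpu_layers=-1 and offloads all layers to Metal.
./dist/cht-llama-cpp-mistral-1.2.0-aarch64-apple-darwin \
    --model-path ./ml/models/yarn-mistral-7b-128k.Q4_K_M.gguf --port 8000 --ctx 4096

# With DEVICE=cpu (what the CPU Dockerfile now sets), n_gpu_layers=0 forces CPU-only inference.
DEVICE=cpu ./dist/cht-llama-cpp-mistral-1.2.0-aarch64-apple-darwin --port 8000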