Merge pull request #117 from filopedraz/feat/new-llama-cpp-services
casperdcl authored Oct 18, 2023
2 parents 422dbcb + 13a3a90 commit beb0aab
Showing 12 changed files with 117 additions and 76 deletions.
59 changes: 52 additions & 7 deletions .github/workflows/ci.yml
@@ -1,6 +1,6 @@
name: CI
on:
push: {branches: [main]}
push: {branches: [main], tags: ['v**']}
pull_request_target: {paths: ['*-*/**']}
schedule: [{cron: '0 11 * * 6'}] # M H d m w (Sat 11:00)
jobs:
@@ -16,7 +16,8 @@ jobs:
name: List modified models
run: |
if test "${{ github.event_name }}" = schedule; then
echo "modified=$(ls -d *-* | sort | jq -Rsc 'split("\n")[:-1]')" >> $GITHUB_OUTPUT
echo "debug: collecting all models"
MODELS=$(ls -d *-* | sort | jq -Rsc 'split("\n")[:-1]')
else
if [[ "${{ github.event_name }}" == pull_request* ]]; then
git fetch origin --depth=2 "${{ github.base_ref }}"
@@ -26,12 +27,20 @@ jobs:
fi
echo "debug: base: $BASE"
git diff --stat $BASE
echo "modified=$(
echo "debug: collecting changed models"
MODELS=$(
for model in $(ls -d *-*); do
git diff --quiet $BASE -- $model || echo $model
done | sort | jq -Rsc 'split("\n")[:-1]'
)" >> $GITHUB_OUTPUT
)
fi
echo "modified=$MODELS" >> $GITHUB_OUTPUT
echo "debug: collecting binary builds"
echo "modified_aarch64_macos=$(
for model in $(echo "$MODELS" | jq -r .[]); do
test -f $model/build-aarch64-apple-darwin.sh && echo $model || :
done | sort | jq -Rsc 'split("\n")[:-1]'
)" >> $GITHUB_OUTPUT
- name: Launch self-hosted runners
if: steps.list.outputs.modified != '[]'
run: |
@@ -48,7 +57,8 @@ jobs:
MACHINE: ${{ secrets.PAPERSPACE_MACHINE }}
outputs:
models: ${{ steps.list.outputs.modified }}
build-test-push:
models_macos-latest-xlarge: ${{ steps.list.outputs.modified_aarch64_macos }}
docker-build-test-push:
concurrency: # terminate on new commits to PRs
group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}-${{ matrix.model }}
cancel-in-progress: true
@@ -86,14 +96,49 @@ jobs:
TESTS_SKIP_CPU: ${{ github.event_name == 'push' && '1' || '' }}
# avoid OoM errors (TODO: remove when using larger GPU)
TESTS_SKIP_GPU: ${{ contains(fromJSON('["a2t-whisper","cht-dolly-v2","cht-gorilla","cht-llama-v2","cht-mpt","cht-xgen","dfs-dalle","dfs-diffusers"]'), matrix.model) && '1' || '' }}
macos-aarch64-build-release:
concurrency: # terminate on new commits to PRs
group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}-${{ matrix.model }}-bin
cancel-in-progress: true
needs: setup
runs-on: macos-latest-xlarge
strategy:
fail-fast: false
matrix:
model: ${{ fromJson(needs.setup.outputs.models_macos-latest-xlarge) }}
# if nothing to do, skip without failing
if: needs.setup.outputs.models_macos-latest-xlarge != '[]'
permissions:
contents: write
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha || github.ref }}
- uses: actions/setup-python@v4
with: {python-version: 3.11}
- id: dist
name: build
run: |
pip install virtualenv
./build-aarch64-apple-darwin.sh $(python -c 'import sys; print(".".join(map(str,sys.version_info[:2])))')
working-directory: ${{ matrix.model }}
- uses: actions/upload-artifact@v3
with: {path: dist/*}
- if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
name: Update release assets
run: |
tag="${GITHUB_REF#refs/tags/}"
gh release upload --clobber "${tag%%.*}" dist/*
env:
GH_TOKEN: ${{ github.token }}
pass: # convenient single job to apply branch protection to
needs: build-test-push
needs: [docker-build-test-push, macos-aarch64-build-release]
runs-on: ubuntu-latest
steps:
- run: echo success
teardown:
environment: ${{ github.event_name == 'pull_request_target' && ! contains('OWNER,MEMBER,COLLABORATOR', github.event.pull_request.author_association) && 'external' || 'internal' }}
needs: build-test-push
needs: docker-build-test-push
if: always()
runs-on: ubuntu-latest
steps:
1 change: 1 addition & 0 deletions .gitignore
@@ -160,3 +160,4 @@ cython_debug/
#.idea/

*.bin
*.gguf
6 changes: 3 additions & 3 deletions .vscode/settings.json
@@ -1,10 +1,10 @@
{
"python.defaultInterpreterPath": "./venv/bin/python",
"python.formatting.provider": "none",
//"python.defaultInterpreterPath": "./venv/bin/python",
//"python.formatting.provider": "none",
"editor.formatOnSave": true,
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.testing.cwd": "${workspaceFolder}/ebd-all-minilm/",
//"python.testing.cwd": "${workspaceFolder}/ebd-all-minilm/",
"python.analysis.inlayHints.functionReturnTypes": true,
"[python]": {
"editor.defaultFormatter": "ms-python.autopep8"
24 changes: 24 additions & 0 deletions cht-llama-cpp/README.md
@@ -0,0 +1,24 @@
# Documentation

## Download the Models

e.g.:

```bash
mkdir -p ./ml/models/
wget -P ./ml/models/ https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q5_0.gguf
wget -P ./ml/models/ https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF/resolve/main/mistral-7b-openorca.Q5_K_S.gguf
```

## Compile the Backend

```bash
pip install virtualenv
./build-aarch64-apple-darwin.sh
```

## Run the compiled file

```bash
./dist/mistral-1-aarch64-apple-darwin --model_path ./ml/models/mistral-7b-instruct-v0.1.Q5_0.gguf
```
17 changes: 17 additions & 0 deletions cht-llama-cpp/build-aarch64-apple-darwin.sh
@@ -0,0 +1,17 @@
#!/bin/bash
set -e
export VERSION=1.0.0

virtualenv venv -p=${1:-3.11}
source ./venv/bin/activate
pip install -r requirements.txt pyinstaller
LLAMA_CPP_PATH=$(python -c 'import llama_cpp; print(llama_cpp.__path__[0])')
# macOS (dylib) package
NAME=mistral-${VERSION}-aarch64-apple-darwin
pyinstaller --onefile \
--target-arch arm64 \
--add-binary "$LLAMA_CPP_PATH/libllama.dylib:llama_cpp" \
--name=$NAME \
--paths ./venv/lib/python${1:-3.11}/site-packages \
main.py
cp dist/$NAME dist/mistral-${VERSION%%.*}-aarch64-apple-darwin
6 changes: 3 additions & 3 deletions cht-llama-cpp/build.sh
@@ -1,7 +1,7 @@
#!/bin/bash
set -e
export VERSION=1.0.5
export VERSION=1.0.0
source "$(dirname "${BASH_SOURCE[0]}")/../utils.sh"

build_cpu ghcr.io/premai-io/chat-gpt4all-lora-q4-cpu gpt4all-lora-q4 ${@:1}
build_cpu ghcr.io/premai-io/chat-vicuna-7b-q4-cpu vicuna-7b-q4 ${@:1}
build_cpu ghcr.io/premai-io/chat-mistral-7b-instruct-q5 mistral-7b-instruct-v0.1.Q5_0 --build-arg="MODEL_ID=mistral-7b-instruct-v0.1.Q5_0" --build-arg="MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q5_0.gguf" ${@:1}
build_cpu ghcr.io/premai-io/chat-mistral-7b-openorca-q5 mistral-7b-openorca.Q5_K_S --build-arg="MODEL_ID=mistral-7b-openorca.Q5_K_S" --build-arg="MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF/resolve/main/mistral-7b-openorca.Q5_K_S.gguf" ${@:1}
3 changes: 2 additions & 1 deletion cht-llama-cpp/docker/cpu/Dockerfile
@@ -9,8 +9,9 @@ COPY requirements.txt .
RUN pip install --no-cache-dir -r ./requirements.txt

ARG MODEL_ID
ARG MODEL_DOWNLOAD_URL
RUN mkdir -p ./ml/models/
RUN wget -O ./ml/models/${MODEL_ID}.bin https://prem-models.s3.eu-central-1.amazonaws.com/${MODEL_ID}.bin
RUN wget -O ./ml/models/${MODEL_ID}.gguf ${MODEL_DOWNLOAD_URL}

COPY . .
ENV MODEL_ID=$MODEL_ID
13 changes: 11 additions & 2 deletions cht-llama-cpp/main.py
@@ -1,4 +1,6 @@
import argparse
import logging
import os

import uvicorn
from dotenv import load_dotenv
@@ -8,6 +10,13 @@

load_dotenv()

MODEL_PATH = f"./ml/models/{os.getenv('MODEL_ID', 'mistral-7b-instruct-v0.1.Q5_0')}.gguf"
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", help="Path to GGUF", default=MODEL_PATH)
args = parser.parse_args()
MODEL_PATH = args.model_path

logging.basicConfig(
format="%(asctime)s %(levelname)-8s %(message)s",
level=logging.INFO,
@@ -19,7 +28,7 @@ def create_start_app_handler(app: FastAPI):
def start_app() -> None:
from models import LLaMACPPBasedModel

LLaMACPPBasedModel.get_model()
LLaMACPPBasedModel.get_model(MODEL_PATH)

return start_app

@@ -42,4 +51,4 @@ def get_application() -> FastAPI:


if __name__ == "__main__":
uvicorn.run("main:app", host="0.0.0.0", port=8000)
uvicorn.run(app, host="0.0.0.0", port=8000)
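The net effect of this change is that the model path is no longer hard-wired: a `--model_path` flag wins when main.py is run as a script, otherwise the `MODEL_ID` environment variable selects a GGUF under `./ml/models/`. A small sketch of that resolution order (the helper function name is invented for illustration):

```python
import os


def resolve_model_path(cli_path: str | None = None) -> str:
    # mirrors main.py: CLI flag first, then the MODEL_ID env var, then the bundled default
    default = f"./ml/models/{os.getenv('MODEL_ID', 'mistral-7b-instruct-v0.1.Q5_0')}.gguf"
    return cli_path or default


print(resolve_model_path())
# ./ml/models/mistral-7b-instruct-v0.1.Q5_0.gguf
print(resolve_model_path("./ml/models/custom.gguf"))  # hypothetical override
# ./ml/models/custom.gguf
```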
18 changes: 3 additions & 15 deletions cht-llama-cpp/models.py
@@ -1,19 +1,10 @@
import multiprocessing
import os

from llama_cpp import Llama

MODEL_ZOO = {
"gpt4all-lora-q4": {"modelWeightsName": "gpt4all-lora-q4.bin", "ctxMaxTokens": 512},
"vicuna-7b-q4": {"modelWeightsName": "vicuna-7b-q4.bin", "ctxMaxTokens": 512},
}
DEFAULT_N_THREADS = max(multiprocessing.cpu_count() // 2, 1)


def get_model_info() -> dict:
return MODEL_ZOO[os.getenv("MODEL_ID", "vicuna-7b-q4")]


class LLaMACPPBasedModel(object):
model = None

@@ -24,7 +15,7 @@ def tokenize(cls, prompt):
@classmethod
def reduce_number_of_messages(cls, messages, max_tokens):
buffer_tokens = 32
ctx_max_tokens = get_model_info()["ctxMaxTokens"]
ctx_max_tokens = 4096
num_messages = len(messages)

tokens = [len(cls.tokenize(doc["content"])) for doc in messages]
@@ -62,12 +53,9 @@ def generate(
)

@classmethod
def get_model(cls):
def get_model(cls, model_path):
if cls.model is None:
cls.model = Llama(
model_path=f"./ml/models/{get_model_info()['modelWeightsName']}",
embedding=True,
)
cls.model = Llama(model_path)

return cls.model

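Note that `get_model` still caches a single `Llama` instance at class level, so the first path passed in is the one the process keeps serving; the `Llama(model_path)` positional call matches the llama-cpp-python 0.2.x constructor. A minimal sketch of that behaviour (the second path is purely illustrative):

```python
from models import LLaMACPPBasedModel

llm = LLaMACPPBasedModel.get_model("./ml/models/mistral-7b-instruct-v0.1.Q5_0.gguf")

# a later call with a different path returns the cached instance rather than
# loading another model
same = LLaMACPPBasedModel.get_model("./ml/models/mistral-7b-openorca.Q5_K_S.gguf")
assert llm is same
```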
2 changes: 1 addition & 1 deletion cht-llama-cpp/requirements.txt
@@ -6,4 +6,4 @@ tqdm==4.65.0
httpx==0.23.3
python-dotenv==1.0.0
tenacity==8.2.2
llama-cpp-python==0.1.43
llama-cpp-python==0.2.11
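The bump from 0.1.43 to 0.2.11 crosses llama.cpp's move from GGML to GGUF weights, which is why the Dockerfile, .gitignore and README above now deal in `.gguf` files. A minimal sketch of loading one of the downloaded models with the 0.2.x bindings (path taken from the README's download step):

```python
from llama_cpp import Llama

llm = Llama(model_path="./ml/models/mistral-7b-instruct-v0.1.Q5_0.gguf")
# plain completion call; prompt and sampling settings are illustrative
out = llm("Q: Name the planets in the solar system. A:", max_tokens=64, stop=["Q:"])
print(out["choices"][0]["text"])
```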
35 changes: 0 additions & 35 deletions cht-llama-cpp/routes.py
@@ -35,30 +35,6 @@ class ChatCompletionResponse(BaseModel):
usage: dict = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}


class EmbeddingsInput(BaseModel):
model: str
input: str
user: str = ""


class EmbeddingObject(BaseModel):
object: str = "embedding"
index: int = 0
embedding: list[float]


class EmbeddingUsage(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0


class EmbeddingsResponse(BaseModel):
object: str = "list"
data: list[EmbeddingObject]
model: str = ""
usage: EmbeddingUsage


class HealthResponse(BaseModel):
status: bool

@@ -113,14 +89,3 @@ async def chat_completions(body: ChatCompletionInput):
status_code=400,
detail={"message": str(error)},
)


@router.post("/embeddings", response_model=EmbeddingsResponse)
async def embeddings(body: EmbeddingsInput):
try:
return model.embeddings(text=body.input)
except ValueError as error:
raise HTTPException(
status_code=400,
detail={"message": str(error)},
)
9 changes: 0 additions & 9 deletions cht-llama-cpp/tests/test_views.py
@@ -24,12 +24,3 @@ def test_chat_llama_cpp() -> None:
},
)
assert response.status_code == 200

response = client.post(
"/v1/embeddings",
json={
"model": "vicuna-7b-q4",
"input": "Hello!",
},
)
assert response.status_code == 200
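With the /v1/embeddings case removed, the suite only exercises the chat endpoint. A hedged sketch of the equivalent request against a locally running service, assuming an OpenAI-style /v1/chat/completions route and response shape (the exact schema is not visible in this diff):

```python
import httpx  # already pinned in requirements.txt

payload = {
    "model": "mistral-7b-instruct-v0.1.Q5_0",
    "messages": [{"role": "user", "content": "Hello!"}],
}
# assumes the service from main.py is listening on localhost:8000
response = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=600.0)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```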
