Merge pull request #117 from filopedraz/feat/new-llama-cpp-services
casperdcl authored Oct 18, 2023
2 parents 422dbcb + 13a3a90 commit beb0aab
Showing 12 changed files with 117 additions and 76 deletions.
59 changes: 52 additions & 7 deletions .github/workflows/ci.yml
@@ -1,6 +1,6 @@
name: CI
on:
push: {branches: [main]}
push: {branches: [main], tags: ['v**']}
pull_request_target: {paths: ['*-*/**']}
schedule: [{cron: '0 11 * * 6'}] # M H d m w (Sat 11:00)
jobs:
@@ -16,7 +16,8 @@ jobs:
name: List modified models
run: |
if test "${{ github.event_name }}" = schedule; then
echo "modified=$(ls -d *-* | sort | jq -Rsc 'split("\n")[:-1]')" >> $GITHUB_OUTPUT
echo "debug: collecting all models"
MODELS=$(ls -d *-* | sort | jq -Rsc 'split("\n")[:-1]')
else
if [[ "${{ github.event_name }}" == pull_request* ]]; then
git fetch origin --depth=2 "${{ github.base_ref }}"
@@ -26,12 +27,20 @@ jobs:
fi
echo "debug: base: $BASE"
git diff --stat $BASE
echo "modified=$(
echo "debug: collecting changed models"
MODELS=$(
for model in $(ls -d *-*); do
git diff --quiet $BASE -- $model || echo $model
done | sort | jq -Rsc 'split("\n")[:-1]'
)" >> $GITHUB_OUTPUT
)
fi
echo "modified=$MODELS" >> $GITHUB_OUTPUT
echo "debug: collecting binary builds"
echo "modified_aarch64_macos=$(
for model in $(echo "$MODELS" | jq -r .[]); do
test -f $model/build-aarch64-apple-darwin.sh && echo $model || :
done | sort | jq -Rsc 'split("\n")[:-1]'
)" >> $GITHUB_OUTPUT
- name: Launch self-hosted runners
if: steps.list.outputs.modified != '[]'
run: |
@@ -48,7 +57,8 @@ jobs:
MACHINE: ${{ secrets.PAPERSPACE_MACHINE }}
outputs:
models: ${{ steps.list.outputs.modified }}
build-test-push:
models_macos-latest-xlarge: ${{ steps.list.outputs.modified_aarch64_macos }}
docker-build-test-push:
concurrency: # terminate on new commits to PRs
group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}-${{ matrix.model }}
cancel-in-progress: true
@@ -86,14 +96,49 @@ jobs:
TESTS_SKIP_CPU: ${{ github.event_name == 'push' && '1' || '' }}
# avoid OoM errors (TODO: remove when using larger GPU)
TESTS_SKIP_GPU: ${{ contains(fromJSON('["a2t-whisper","cht-dolly-v2","cht-gorilla","cht-llama-v2","cht-mpt","cht-xgen","dfs-dalle","dfs-diffusers"]'), matrix.model) && '1' || '' }}
macos-aarch64-build-release:
concurrency: # terminate on new commits to PRs
group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}-${{ matrix.model }}-bin
cancel-in-progress: true
needs: setup
runs-on: macos-latest-xlarge
strategy:
fail-fast: false
matrix:
model: ${{ fromJson(needs.setup.outputs.models_macos-latest-xlarge) }}
# if nothing to do, skip without failing
if: needs.setup.outputs.models_macos-latest-xlarge != '[]'
permissions:
contents: write
steps:
- uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha || github.ref }}
- uses: actions/setup-python@v4
with: {python-version: 3.11}
- id: dist
name: build
run: |
pip install virtualenv
./build-aarch64-apple-darwin.sh $(python -c 'import sys; print(".".join(map(str,sys.version_info[:2])))')
working-directory: ${{ matrix.model }}
- uses: actions/upload-artifact@v3
with: {path: dist/*}
- if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
name: Update release assets
run: |
tag="${GITHUB_REF#refs/tags/}"
gh release upload --clobber "${tag%%.*}" dist/*
env:
GH_TOKEN: ${{ github.token }}
pass: # convenient single job to apply branch protection to
needs: build-test-push
needs: [docker-build-test-push, macos-aarch64-build-release]
runs-on: ubuntu-latest
steps:
- run: echo success
teardown:
environment: ${{ github.event_name == 'pull_request_target' && ! contains('OWNER,MEMBER,COLLABORATOR', github.event.pull_request.author_association) && 'external' || 'internal' }}
needs: build-test-push
needs: docker-build-test-push
if: always()
runs-on: ubuntu-latest
steps:
1 change: 1 addition & 0 deletions .gitignore
@@ -160,3 +160,4 @@ cython_debug/
#.idea/

*.bin
*.gguf
6 changes: 3 additions & 3 deletions .vscode/settings.json
@@ -1,10 +1,10 @@
{
"python.defaultInterpreterPath": "./venv/bin/python",
"python.formatting.provider": "none",
//"python.defaultInterpreterPath": "./venv/bin/python",
//"python.formatting.provider": "none",
"editor.formatOnSave": true,
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.testing.cwd": "${workspaceFolder}/ebd-all-minilm/",
//"python.testing.cwd": "${workspaceFolder}/ebd-all-minilm/",
"python.analysis.inlayHints.functionReturnTypes": true,
"[python]": {
"editor.defaultFormatter": "ms-python.autopep8"
24 changes: 24 additions & 0 deletions cht-llama-cpp/README.md
@@ -0,0 +1,24 @@
# Documentation

## Download the Models

e.g.:

```bash
mkdir -p ./ml/models/
wget -P ./ml/models/ https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q5_0.gguf
wget -P ./ml/models/ https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF/resolve/main/mistral-7b-openorca.Q5_K_S.gguf
```

## Compile the Backend

```bash
pip install virtualenv
./build-aarch64-apple-darwin.sh
```

## Run the compiled file

```bash
./dist/mistral-1-aarch64-apple-darwin --model_path ./ml/models/mistral-7b-instruct-v0.1.Q5_0.gguf
```
17 changes: 17 additions & 0 deletions cht-llama-cpp/build-aarch64-apple-darwin.sh
@@ -0,0 +1,17 @@
#!/bin/bash
set -e
export VERSION=1.0.0

virtualenv venv -p=${1:-3.11}
source ./venv/bin/activate
pip install -r requirements.txt pyinstaller
LLAMA_CPP_PATH=$(python -c 'import llama_cpp; print(llama_cpp.__path__[0])')
# macOS (dylib) package
NAME=mistral-${VERSION}-aarch64-apple-darwin
pyinstaller --onefile \
--target-arch arm64 \
--add-binary "$LLAMA_CPP_PATH/libllama.dylib:llama_cpp" \
--name=$NAME \
--paths ./venv/lib/python${1:-3.11}/site-packages \
main.py
cp dist/$NAME dist/mistral-${VERSION%%.*}-aarch64-apple-darwin
6 changes: 3 additions & 3 deletions cht-llama-cpp/build.sh
@@ -1,7 +1,7 @@
#!/bin/bash
set -e
export VERSION=1.0.5
export VERSION=1.0.0
source "$(dirname "${BASH_SOURCE[0]}")/../utils.sh"

build_cpu ghcr.io/premai-io/chat-gpt4all-lora-q4-cpu gpt4all-lora-q4 ${@:1}
build_cpu ghcr.io/premai-io/chat-vicuna-7b-q4-cpu vicuna-7b-q4 ${@:1}
build_cpu ghcr.io/premai-io/chat-mistral-7b-instruct-q5 mistral-7b-instruct-v0.1.Q5_0 --build-arg="MODEL_ID=mistral-7b-instruct-v0.1.Q5_0" --build-arg="MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q5_0.gguf" ${@:1}
build_cpu ghcr.io/premai-io/chat-mistral-7b-openorca-q5 mistral-7b-openorca.Q5_K_S --build-arg="MODEL_ID=mistral-7b-openorca.Q5_K_S" --build-arg="MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-GGUF/resolve/main/mistral-7b-openorca.Q5_K_S.gguf" ${@:1}
3 changes: 2 additions & 1 deletion cht-llama-cpp/docker/cpu/Dockerfile
@@ -9,8 +9,9 @@ COPY requirements.txt .
RUN pip install --no-cache-dir -r ./requirements.txt

ARG MODEL_ID
ARG MODEL_DOWNLOAD_URL
RUN mkdir -p ./ml/models/
RUN wget -O ./ml/models/${MODEL_ID}.bin https://prem-models.s3.eu-central-1.amazonaws.com/${MODEL_ID}.bin
RUN wget -O ./ml/models/${MODEL_ID}.gguf ${MODEL_DOWNLOAD_URL}

COPY . .
ENV MODEL_ID=$MODEL_ID
13 changes: 11 additions & 2 deletions cht-llama-cpp/main.py
@@ -1,4 +1,6 @@
import argparse
import logging
import os

import uvicorn
from dotenv import load_dotenv
@@ -8,6 +10,13 @@

load_dotenv()

MODEL_PATH = f"./ml/models/{os.getenv('MODEL_ID', 'mistral-7b-instruct-v0.1.Q5_0')}.gguf"
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", help="Path to GGUF", default=MODEL_PATH)
args = parser.parse_args()
MODEL_PATH = args.model_path

logging.basicConfig(
format="%(asctime)s %(levelname)-8s %(message)s",
level=logging.INFO,
@@ -19,7 +28,7 @@ def create_start_app_handler(app: FastAPI):
def start_app() -> None:
from models import LLaMACPPBasedModel

LLaMACPPBasedModel.get_model()
LLaMACPPBasedModel.get_model(MODEL_PATH)

return start_app

@@ -42,4 +51,4 @@ def get_application() -> FastAPI:


if __name__ == "__main__":
uvicorn.run("main:app", host="0.0.0.0", port=8000)
uvicorn.run(app, host="0.0.0.0", port=8000)
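The net effect of this change is that the model path is no longer hard-wired: a `--model_path` flag wins when main.py is run as a script, otherwise the `MODEL_ID` environment variable selects a GGUF under `./ml/models/`. A small sketch of that resolution order (the helper function name is invented for illustration):

```python
import os


def resolve_model_path(cli_path: str | None = None) -> str:
    # mirrors main.py: CLI flag first, then the MODEL_ID env var, then the bundled default
    default = f"./ml/models/{os.getenv('MODEL_ID', 'mistral-7b-instruct-v0.1.Q5_0')}.gguf"
    return cli_path or default


print(resolve_model_path())
# ./ml/models/mistral-7b-instruct-v0.1.Q5_0.gguf
print(resolve_model_path("./ml/models/custom.gguf"))  # hypothetical override
# ./ml/models/custom.gguf
```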
18 changes: 3 additions & 15 deletions cht-llama-cpp/models.py
@@ -1,19 +1,10 @@
import multiprocessing
import os

from llama_cpp import Llama

MODEL_ZOO = {
"gpt4all-lora-q4": {"modelWeightsName": "gpt4all-lora-q4.bin", "ctxMaxTokens": 512},
"vicuna-7b-q4": {"modelWeightsName": "vicuna-7b-q4.bin", "ctxMaxTokens": 512},
}
DEFAULT_N_THREADS = max(multiprocessing.cpu_count() // 2, 1)


def get_model_info() -> dict:
return MODEL_ZOO[os.getenv("MODEL_ID", "vicuna-7b-q4")]


class LLaMACPPBasedModel(object):
model = None

@@ -24,7 +15,7 @@ def tokenize(cls, prompt):
@classmethod
def reduce_number_of_messages(cls, messages, max_tokens):
buffer_tokens = 32
ctx_max_tokens = get_model_info()["ctxMaxTokens"]
ctx_max_tokens = 4096
num_messages = len(messages)

tokens = [len(cls.tokenize(doc["content"])) for doc in messages]
@@ -62,12 +53,9 @@ def generate(
)

@classmethod
def get_model(cls):
def get_model(cls, model_path):
if cls.model is None:
cls.model = Llama(
model_path=f"./ml/models/{get_model_info()['modelWeightsName']}",
embedding=True,
)
cls.model = Llama(model_path)

return cls.model

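Note that `get_model` still caches a single `Llama` instance at class level, so the first path passed in is the one the process keeps serving; the `Llama(model_path)` positional call matches the llama-cpp-python 0.2.x constructor. A minimal sketch of that behaviour (the second path is purely illustrative):

```python
from models import LLaMACPPBasedModel

llm = LLaMACPPBasedModel.get_model("./ml/models/mistral-7b-instruct-v0.1.Q5_0.gguf")

# a later call with a different path returns the cached instance rather than
# loading another model
same = LLaMACPPBasedModel.get_model("./ml/models/mistral-7b-openorca.Q5_K_S.gguf")
assert llm is same
```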
2 changes: 1 addition & 1 deletion cht-llama-cpp/requirements.txt
@@ -6,4 +6,4 @@ tqdm==4.65.0
httpx==0.23.3
python-dotenv==1.0.0
tenacity==8.2.2
llama-cpp-python==0.1.43
llama-cpp-python==0.2.11
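The bump from 0.1.43 to 0.2.11 crosses llama.cpp's move from GGML to GGUF weights, which is why the Dockerfile, .gitignore and README above now deal in `.gguf` files. A minimal sketch of loading one of the downloaded models with the 0.2.x bindings (path taken from the README's download step):

```python
from llama_cpp import Llama

llm = Llama(model_path="./ml/models/mistral-7b-instruct-v0.1.Q5_0.gguf")
# plain completion call; prompt and sampling settings are illustrative
out = llm("Q: Name the planets in the solar system. A:", max_tokens=64, stop=["Q:"])
print(out["choices"][0]["text"])
```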
35 changes: 0 additions & 35 deletions cht-llama-cpp/routes.py
@@ -35,30 +35,6 @@ class ChatCompletionResponse(BaseModel):
usage: dict = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}


class EmbeddingsInput(BaseModel):
model: str
input: str
user: str = ""


class EmbeddingObject(BaseModel):
object: str = "embedding"
index: int = 0
embedding: list[float]


class EmbeddingUsage(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0


class EmbeddingsResponse(BaseModel):
object: str = "list"
data: list[EmbeddingObject]
model: str = ""
usage: EmbeddingUsage


class HealthResponse(BaseModel):
status: bool

@@ -113,14 +89,3 @@ async def chat_completions(body: ChatCompletionInput):
status_code=400,
detail={"message": str(error)},
)


@router.post("/embeddings", response_model=EmbeddingsResponse)
async def embeddings(body: EmbeddingsInput):
try:
return model.embeddings(text=body.input)
except ValueError as error:
raise HTTPException(
status_code=400,
detail={"message": str(error)},
)
9 changes: 0 additions & 9 deletions cht-llama-cpp/tests/test_views.py
@@ -24,12 +24,3 @@ def test_chat_llama_cpp() -> None:
},
)
assert response.status_code == 200

response = client.post(
"/v1/embeddings",
json={
"model": "vicuna-7b-q4",
"input": "Hello!",
},
)
assert response.status_code == 200
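With the /v1/embeddings case removed, the suite only exercises the chat endpoint. A hedged sketch of the equivalent request against a locally running service, assuming an OpenAI-style /v1/chat/completions route and response shape (the exact schema is not visible in this diff):

```python
import httpx  # already pinned in requirements.txt

payload = {
    "model": "mistral-7b-instruct-v0.1.Q5_0",
    "messages": [{"role": "user", "content": "Hello!"}],
}
# assumes the service from main.py is listening on localhost:8000
response = httpx.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=600.0)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```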
