Fix #8, fix #41 Add llamacpp support #11

Open
wants to merge 13 commits into
base: main
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ yarn-debug.log*
yarn-error.log*
yarn.lock*
package-lock.json
pnpm-lock.yaml
Cargo.lock

# Runtime data
54 changes: 34 additions & 20 deletions README.md
@@ -36,7 +36,7 @@ This runs a Flask process, so you can add the typical flags such as setting a di
```sh
$ git clone https://github.com/nat/openplayground
$ cd app && npm install && npx parcel watch src/index.html --no-cache
$ cd server && pip3 install -r requirements.txt && cd .. && python3 -m server.app
$ cd server && pip3 install -r requirements.txt && cd .. && python3 -m server.app -m ./server/models.json
```

## Docker
@@ -75,28 +75,42 @@ You can add models in `server/models.json` with the following schema:

#### Local inference

For models running locally on your device, add the llama-cpp-python dependency and set the **LLAMA-7B_MODEL_BIN_PATH** and **LLAMA-7B_MODEL_PROMPT_PATH** variables in the .env file. The **LLAMA-7B** part should match the model name in `models.json`.
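For example, with a model named `llama-7b` in `models.json`, the `.env` entries might look like this (the paths below are placeholders, substitute your own files):

```sh
# Placeholder paths – point these at your own model weights and prompt template file
LLAMA-7B_MODEL_BIN_PATH=./models/llama-7b/ggml-model-q4_0.bin
LLAMA-7B_MODEL_PROMPT_PATH=./server/prompts/llama-7b-chat.txt
```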

The LLAMA-7B_MODEL_PROMPT_PATH file should match the model's prompt format. Here are some examples:

##### Llama

```
Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.

User: Hello, Bob.
Bob: Hello. How may I help you today?
User: Please tell me the largest city in Europe.
Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
User:{prompt}
Bob:
```

##### Alpaca

```
### Instruction:
{prompt}

### Response:

```

The matching entry in `server/models.json` looks like the following (a minimal example):

```json
"llama": {
"api_key" : false,
"models" : {
"llama-70b": {
"parameters": {
"temperature": {
"value": 0.5,
"range": [
0.1,
1.0
]
},
}
}
}
}
```

##### Vicuna

```
### Human:{prompt}
### Assistant:
```


Keep in mind you will need to add a generation method for your model in `server/app.py`. Take a look at `local_text_generation_llama()` as an example.
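For reference, the wiring in `server/app.py` for this provider boils down to one extra dispatch branch inside `text_generation()` (the same lines appear in the diff below; the branches for other providers are omitted here):

```python
# Sketch of the dispatch in text_generation() in server/app.py:
# route "llama-local" requests to the llama.cpp generation method,
# mirroring the existing provider branches.
if inference_request.model_provider == "openai":
    return self.inference_manager.openai_text_generation(provider_details, inference_request)
elif inference_request.model_provider == "llama-local":
    return self.inference_manager.local_text_generation_llama(provider_details, inference_request)
```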

#### API Provider Inference

1 change: 1 addition & 0 deletions app/src/components/parameters-side-panel.tsx
@@ -24,6 +24,7 @@ import {handleSelectModel} from "../lib/utils"

const modelProviders = {
forefront: "Forefront",
"llama-local": "Llama (Local)",
"huggingface-local": "Hugging Face (Local)",
huggingface: "Hugging Face",
"aleph-alpha": "Aleph Alpha",
2 changes: 2 additions & 0 deletions app/src/lib/editor-styles.tsx
@@ -57,6 +57,8 @@ export const styleMap = {
return styles.openai;
case "huggingface-local":
return styles.huggingface_local;
case "llama-local":
return styles.huggingface_local;
case "cohere":
return styles.cohere;
case "huggingface":
2 changes: 2 additions & 0 deletions server/app.py
@@ -299,6 +299,8 @@ def text_generation(self, inference_request: InferenceRequest):

if inference_request.model_provider == "openai":
return self.inference_manager.openai_text_generation(provider_details, inference_request)
elif inference_request.model_provider == "llama-local":
return self.inference_manager.local_text_generation_llama(provider_details, inference_request)
elif inference_request.model_provider == "cohere":
return self.inference_manager.cohere_text_generation(provider_details, inference_request)
elif inference_request.model_provider == "huggingface":
2 changes: 1 addition & 1 deletion server/lib/api/inference.py
@@ -124,6 +124,6 @@ def split_tasks_by_provider(tasks: List[InferenceRequest]) -> Tuple[List[Inferen
local_tasks, remote_tasks = [], []

for task in tasks:
(local_tasks if task.model_provider == "huggingface-local" else remote_tasks).append(task)
(local_tasks if "-local" in task.model_provider else remote_tasks).append(task)

return local_tasks, remote_tasks
48 changes: 47 additions & 1 deletion server/lib/inference/__init__.py
@@ -1,3 +1,4 @@
from pathlib import Path
import anthropic
import cachetools
import math
@@ -15,6 +16,7 @@
from dataclasses import dataclass
from typing import Callable, Union
from .huggingface.hf import HFInference
from llama_cpp import Llama

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@@ -191,7 +193,7 @@ def __error_handler__(self, inference_fn: InferenceFunction, provider_details: P
logger.error(f"Error parsing response from API: {e}")
except Exception as e:
infer_result.token = f"[ERROR] {e}"
logger.error(f"Error: {e}")
logger.exception(f"Error: {e}")
finally:
if infer_result.token is None:
infer_result.token = "[COMPLETED]"
@@ -601,6 +603,50 @@ def __local_text_generation__(self, provider_details: ProviderDetails, inference
def local_text_generation(self, provider_details: ProviderDetails, inference_request: InferenceRequest):
self.__error_handler__(self.__local_text_generation__, provider_details, inference_request)

def __local_text_generation_llama__(self, provider_details: ProviderDetails, inference_request: InferenceRequest):
cancelled = False
env_model_bin_path = inference_request.model_name.upper() + '_MODEL_BIN_PATH'
env_model_prompt_path = inference_request.model_name.upper() + '_MODEL_PROMPT_PATH'
llama_model_path = os.environ.get(env_model_bin_path)
llama_prompt_path = os.environ.get(env_model_prompt_path)
if not llama_model_path:
logger.error(f"Please add {env_model_bin_path} to the .env file or environment variables if you want to use this model.")
return
if not llama_prompt_path:
logger.warning(f"Please add {env_model_prompt_path} (the path to a prompt template file containing a {{prompt}} format string) to the .env file or environment variables to use this model with a custom prompt format.")
llama_prompt_template = "{prompt}"
else:
with open(Path(llama_prompt_path)) as f:
llama_prompt_template = f.read()
llm = Llama(model_path=llama_model_path)
prompt_final = llama_prompt_template.format(prompt=inference_request.prompt)
stream = llm(
prompt_final,
max_tokens=inference_request.model_parameters['maximumLength'],
temperature=float(inference_request.model_parameters['temperature']),
top_p=float(inference_request.model_parameters['topP']),
repeat_penalty=float(inference_request.model_parameters['repetitionPenalty']),
stop=inference_request.model_parameters['stopSequences'],
stream=True,
)
for output in stream:
if cancelled: break
infer_response = InferenceResult(
uuid=inference_request.uuid,
model_name=inference_request.model_name,
model_tag=inference_request.model_tag,
model_provider=inference_request.model_provider,
token=output['choices'][0]['text'],
probability=None,
top_n_distribution=None
)
if not self.announcer.announce(infer_response, event="infer"):
cancelled = True
logger.info(f"Cancelled inference for {inference_request.uuid} - {inference_request.model_name}")

def local_text_generation_llama(self, provider_details: ProviderDetails, inference_request: InferenceRequest):
self.__error_handler__(self.__local_text_generation_llama__, provider_details, inference_request)

def __anthropic_text_generation__(self, provider_details: ProviderDetails, inference_request: InferenceRequest):
c = anthropic.Client(provider_details.api_key)
