Fix #8, fix #41 Add llamacpp support #11

Open
wants to merge 13 commits into
base: main
1 change: 1 addition & 0 deletions .gitignore
@@ -11,6 +11,7 @@ yarn-debug.log*
yarn-error.log*
yarn.lock*
package-lock.json
pnpm-lock.yaml
Cargo.lock

# Runtime data
54 changes: 34 additions & 20 deletions README.md
@@ -36,7 +36,7 @@ This runs a Flask process, so you can add the typical flags such as setting a di
```sh
$ git clone https://github.com/nat/openplayground
$ cd app && npm install && npx parcel watch src/index.html --no-cache
$ cd server && pip3 install -r requirements.txt && cd .. && python3 -m server.app
$ cd server && pip3 install -r requirements.txt && cd .. && python3 -m server.app -m ./server/models.json
```

## Docker
@@ -75,28 +75,42 @@ You can add models in `server/models.json` with the following schema:

#### Local inference

For models running locally on your device, add the llama-cpp-python dependency and set the **LLAMA-7B_MODEL_BIN_PATH** and **LLAMA-7B_MODEL_PROMPT_PATH** variables in the .env file. The **LLAMA-7B** part should match the model name in `models.json`.
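For example, with a model named `llama-7b` in `models.json`, the `.env` entries might look like this (the paths below are placeholders, substitute your own files):

```sh
# Placeholder paths – point these at your own model weights and prompt template file
LLAMA-7B_MODEL_BIN_PATH=./models/llama-7b/ggml-model-q4_0.bin
LLAMA-7B_MODEL_PROMPT_PATH=./server/prompts/llama-7b-chat.txt
```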

The LLAMA-7B_MODEL_PROMPT_PATH file should match the model's prompt format. Here are some examples:

##### Llama

```
Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.

User: Hello, Bob.
Bob: Hello. How may I help you today?
User: Please tell me the largest city in Europe.
Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
User:{prompt}
Bob:
```

##### Alpaca

```
### Instruction:
{prompt}

### Response:

```

The matching entry in `server/models.json` looks like the following (a minimal example):

```json
"llama": {
"api_key" : false,
"models" : {
"llama-70b": {
"parameters": {
"temperature": {
"value": 0.5,
"range": [
0.1,
1.0
]
},
}
}
}
}
```

##### Vicuna

```
### Human:{prompt}
### Assistant:
```


Keep in mind you will need to add a generation method for your model in `server/app.py`. Take a look at `local_text_generation_llama()` as an example.
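For reference, the wiring in `server/app.py` for this provider boils down to one extra dispatch branch inside `text_generation()` (the same lines appear in the diff below; the branches for other providers are omitted here):

```python
# Sketch of the dispatch in text_generation() in server/app.py:
# route "llama-local" requests to the llama.cpp generation method,
# mirroring the existing provider branches.
if inference_request.model_provider == "openai":
    return self.inference_manager.openai_text_generation(provider_details, inference_request)
elif inference_request.model_provider == "llama-local":
    return self.inference_manager.local_text_generation_llama(provider_details, inference_request)
```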

#### API Provider Inference

1 change: 1 addition & 0 deletions app/src/components/parameters-side-panel.tsx
@@ -24,6 +24,7 @@ import {handleSelectModel} from "../lib/utils"

const modelProviders = {
forefront: "Forefront",
"llama-local": "Llama (Local)",
"huggingface-local": "Hugging Face (Local)",
huggingface: "Hugging Face",
"aleph-alpha": "Aleph Alpha",
2 changes: 2 additions & 0 deletions app/src/lib/editor-styles.tsx
@@ -57,6 +57,8 @@ export const styleMap = {
return styles.openai;
case "huggingface-local":
return styles.huggingface_local;
case "llama-local":
return styles.huggingface_local;
case "cohere":
return styles.cohere;
case "huggingface":
2 changes: 2 additions & 0 deletions server/app.py
@@ -299,6 +299,8 @@ def text_generation(self, inference_request: InferenceRequest):

if inference_request.model_provider == "openai":
return self.inference_manager.openai_text_generation(provider_details, inference_request)
elif inference_request.model_provider == "llama-local":
return self.inference_manager.local_text_generation_llama(provider_details, inference_request)
elif inference_request.model_provider == "cohere":
return self.inference_manager.cohere_text_generation(provider_details, inference_request)
elif inference_request.model_provider == "huggingface":
2 changes: 1 addition & 1 deletion server/lib/api/inference.py
@@ -124,6 +124,6 @@ def split_tasks_by_provider(tasks: List[InferenceRequest]) -> Tuple[List[Inferen
local_tasks, remote_tasks = [], []

for task in tasks:
(local_tasks if task.model_provider == "huggingface-local" else remote_tasks).append(task)
(local_tasks if "-local" in task.model_provider else remote_tasks).append(task)

return local_tasks, remote_tasks
48 changes: 47 additions & 1 deletion server/lib/inference/__init__.py
@@ -1,3 +1,4 @@
from pathlib import Path
import anthropic
import cachetools
import math
@@ -15,6 +16,7 @@
from dataclasses import dataclass
from typing import Callable, Union
from .huggingface.hf import HFInference
from llama_cpp import Llama

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@@ -191,7 +193,7 @@ def __error_handler__(self, inference_fn: InferenceFunction, provider_details: P
logger.error(f"Error parsing response from API: {e}")
except Exception as e:
infer_result.token = f"[ERROR] {e}"
logger.error(f"Error: {e}")
logger.exception(f"Error: {e}")
finally:
if infer_result.token is None:
infer_result.token = "[COMPLETED]"
@@ -601,6 +603,50 @@ def __local_text_generation__(self, provider_details: ProviderDetails, inference
def local_text_generation(self, provider_details: ProviderDetails, inference_request: InferenceRequest):
self.__error_handler__(self.__local_text_generation__, provider_details, inference_request)

def __local_text_generation_llama__(self, provider_details: ProviderDetails, inference_request: InferenceRequest):
cancelled = False
env_model_bin_path = inference_request.model_name.upper() + '_MODEL_BIN_PATH'
env_model_prompt_path = inference_request.model_name.upper() + '_MODEL_PROMPT_PATH'
llama_model_path = os.environ.get(env_model_bin_path)
llama_prompt_path = os.environ.get(env_model_prompt_path)
if not llama_model_path:
logger.error(f"Please add {env_model_bin_path} to the .env file or environment variables if you want to use this model.")
return
if not llama_prompt_path:
logger.warning(f"Please add {env_model_prompt_path} (the path to a prompt template file containing a {{prompt}} format string) to the .env file or environment variables to use this model with a custom prompt format.")
llama_prompt_template = "{prompt}"
else:
with open(Path(llama_prompt_path)) as f:
llama_prompt_template = f.read()
llm = Llama(model_path=llama_model_path)
prompt_final = llama_prompt_template.format(prompt=inference_request.prompt)
stream = llm(
prompt_final,
max_tokens=inference_request.model_parameters['maximumLength'],
temperature=float(inference_request.model_parameters['temperature']),
top_p=float(inference_request.model_parameters['topP']),
repeat_penalty=float(inference_request.model_parameters['repetitionPenalty']),
stop=inference_request.model_parameters['stopSequences'],
stream=True,
)
for output in stream:
if cancelled: break
infer_response = InferenceResult(
uuid=inference_request.uuid,
model_name=inference_request.model_name,
model_tag=inference_request.model_tag,
model_provider=inference_request.model_provider,
token=output['choices'][0]['text'],
probability=None,
top_n_distribution=None
)
if not self.announcer.announce(infer_response, event="infer"):
cancelled = True
logger.info(f"Cancelled inference for {inference_request.uuid} - {inference_request.model_name}")

def local_text_generation_llama(self, provider_details: ProviderDetails, inference_request: InferenceRequest):
self.__error_handler__(self.__local_text_generation_llama__, provider_details, inference_request)

def __anthropic_text_generation__(self, provider_details: ProviderDetails, inference_request: InferenceRequest):
c = anthropic.Client(provider_details.api_key)
