Merge branch 'master' into rf-channel-split-1
dave-gray101 authored Apr 21, 2024
2 parents ead30a5 + 38c9abe commit 67ea9ef
Showing 4 changed files with 22 additions and 7 deletions.
Makefile: 1 addition, 1 deletion

@@ -5,7 +5,7 @@ BINARY_NAME=local-ai

# llama.cpp versions
GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=0e4802b2ecbaab04b4f829fde4a3096ca19c84b5
+CPPLLAMA_VERSION?=b8109bc0139f15a5b321909f47510b89dca47ffc

# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
README.md: 1 addition, 0 deletions

@@ -50,6 +50,7 @@

[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

+- llama3: https://github.com/mudler/LocalAI/discussions/2076
- Parler-TTS: https://github.com/mudler/LocalAI/pull/2027
- Landing page: https://github.com/mudler/LocalAI/pull/1922
- Openvino support: https://github.com/mudler/LocalAI/pull/1892
backend/python/transformers/transformers_server.py: 18 additions, 4 deletions

@@ -148,7 +148,8 @@ def LoadModel(self, request, context):
            else:
                device_map="CPU"
            self.model = OVModelForCausalLM.from_pretrained(model_name,
-               compile=True,
+               compile=True,
+               ov_config={"PERFORMANCE_HINT": "LATENCY"},
                device=device_map)
            self.OV = True
        else:
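
The new ov_config argument passes an OpenVINO performance hint through optimum-intel when the backend loads a model. A minimal, self-contained sketch of the same loading pattern follows; the model id, export=True, and the CPU device are illustrative assumptions, not taken from this commit.

# Sketch only: load a causal LM via optimum-intel with a latency-oriented hint.
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

model_id = "gpt2"  # placeholder model chosen for demonstration
model = OVModelForCausalLM.from_pretrained(
    model_id,
    export=True,                                # convert to OpenVINO IR on the fly (assumption)
    compile=True,                               # compile at load time, as the backend does
    ov_config={"PERFORMANCE_HINT": "LATENCY"},  # favour low latency over raw throughput
    device="CPU",                               # matches the backend's CPU fallback
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

PERFORMANCE_HINT is a standard OpenVINO runtime property: LATENCY optimizes for single-request response time, while THROUGHPUT would favour batched workloads.
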
@@ -212,12 +213,25 @@ async def _predict(self, request, context, streaming=False):
        set_seed(request.Seed)
        if request.TopP == 0:
            request.TopP = 0.9

        if request.TopK == 0:
            request.TopK = 40

        max_tokens = 200
        if request.Tokens > 0:
            max_tokens = request.Tokens

-       inputs = self.tokenizer(request.Prompt, return_tensors="pt")
+       prompt = request.Prompt
+       if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
+           prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
+
+       eos_token_id = self.tokenizer.eos_token_id
+       if request.StopPrompts:
+           eos_token_id = []
+           for word in request.StopPrompts:
+               eos_token_id.append(self.tokenizer.convert_tokens_to_ids(word))
+
+       inputs = self.tokenizer(prompt, return_tensors="pt")
        if self.CUDA:
            inputs = inputs.to("cuda")
        if XPU and self.OV == False:
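
This hunk builds the prompt from the request's chat messages when no raw prompt is supplied, and maps each stop prompt to an extra end-of-sequence token id. Below is a small, self-contained sketch of those two tokenizer calls, assuming a model whose tokenizer ships a chat template; the model id, messages, and stop word are placeholders.

# Sketch only: chat-template rendering plus stop words mapped to token ids.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # example model with a chat template

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
# Render the messages into a single prompt string instead of tokenizing directly.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Each stop word becomes an additional eos token id.
stop_words = ["</s>"]
eos_token_id = [tokenizer.convert_tokens_to_ids(w) for w in stop_words]

inputs = tokenizer(prompt, return_tensors="pt")

Note that convert_tokens_to_ids works per token, so this mapping assumes each stop word is a single token in the vocabulary; multi-token stop strings would need a stopping-criteria approach instead.
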
@@ -235,7 +249,7 @@ async def _predict(self, request, context, streaming=False):
                top_k=request.TopK,
                do_sample=True,
                attention_mask=inputs["attention_mask"],
-               eos_token_id=self.tokenizer.eos_token_id,
+               eos_token_id=eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id,
                streamer=streamer)
            thread=Thread(target=self.model.generate, kwargs=config)
@@ -264,7 +278,7 @@ async def _predict(self, request, context, streaming=False):
                top_k=request.TopK,
                do_sample=True,
                attention_mask=inputs["attention_mask"],
-               eos_token_id=self.tokenizer.eos_token_id,
+               eos_token_id=eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id)
            generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
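
In both generation paths the single tokenizer.eos_token_id is replaced by the eos_token_id list built earlier, so generation stops at any configured stop token. A self-contained sketch of generate() with a list of eos ids, including the same prompt-stripping decode; the model choice and prompt are assumptions for illustration.

# Sketch only: generate() accepts a list for eos_token_id, so the first stop id
# produced ends generation. "gpt2" and the prompt are placeholder assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("The quick brown fox", return_tensors="pt")
stop_ids = [tok.convert_tokens_to_ids("."), tok.eos_token_id]

outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=40,
    do_sample=True,
    top_p=0.9,
    top_k=40,
    eos_token_id=stop_ids,          # list form: any of these ids stops generation
    pad_token_id=tok.eos_token_id,  # GPT-2 has no pad token, so reuse eos
)
# Decode only the newly generated tokens, mirroring the slice used by the backend.
print(tok.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0])

Passing a list to eos_token_id is supported by transformers' generate() and avoids a custom StoppingCriteria for single-token stop words.
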

core/config/backend_config.go: 2 additions, 2 deletions

@@ -210,7 +210,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
    defaultMirostatETA := 0.1
    defaultTypicalP := 1.0
    defaultTFZ := 1.0
-   defaultInfinity := -1
+   defaultZero := 0

    // Try to offload all GPU layers (if GPU is found)
    defaultHigh := 99999999
@@ -254,7 +254,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
    }

    if cfg.Maxtokens == nil {
-       cfg.Maxtokens = &defaultInfinity
+       cfg.Maxtokens = &defaultZero
    }

    if cfg.Mirostat == nil {