Update llama.cpp submodule to latest release b3197 (#114)
* Update submodule to latest release b3197

* fix: make embedding work again

* fix: update e2e python script

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: vansangpfiev <[email protected]>
3 people authored Jun 22, 2024
1 parent 7857e05 commit 186bcb2
Showing 4 changed files with 25 additions and 6 deletions.
15 changes: 14 additions & 1 deletion .github/scripts/e2e-test-server.py
@@ -112,7 +112,7 @@ def TestLoadChatModel():
"prompt_template": "[INST] {prompt} [/INST]",
"llama_model_path": cwd + "/" + LLM_MODEL + '.gguf',
"model_alias": LLM_MODEL,
"ngl": 32,
"ngl": 33,
"caching_enabled": True
}

@@ -177,6 +177,18 @@ def TestChatCompletion():
        CleanUp()
        exit(1)

+def TestLlmEmbeddings():
+    new_data = {
+        "input": "This PDFs was created using Microsoft Word using the print to PDF function. True PDFs consist of both text and images. We should think about these PDFs having two layers – one layer is the image and a second layer is the text. The image layer shows what the document will look like if it is printed to paper. The text layer is searchable text that is carried over from the original Word file into the new PDF file (the technical term for this layer is “extracted text”). There is no need to make it searchable and the new PDF will have the same text as the original Word file. An example of True PDFs that federal defenders and CJA panel attorneys will be familiar with are the pleadings filed in CM/ECF. The pleading is originally created in Word, but then the attorney either saves it as PDF or prints to PDF and they file that PDF document with the court. Using either process, there is now a PDF file created with an image layer plus text layer. In terms of usability, this is the best type of PDF to receive in discovery as it will have the closest to text searchability of the original file",
+        "model": LLM_MODEL,
+        "encoding_format": "float"
+    }
+    url_post = "http://127.0.0.1:"+ str(port) + "/v1/embeddings"
+    res = RequestPost(new_data, url_post)
+    if not res:
+        CleanUp()
+        exit(1)
+
def TestUnloadModel(model):
    new_data = {
        "model": model,
@@ -255,6 +267,7 @@ def TestEmbeddings():

TestLoadChatModel()
TestChatCompletion()
+TestLlmEmbeddings()
TestUnloadModel(LLM_MODEL)
TestLoadEmbeddingModel()
TestEmbeddings()
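The new TestLlmEmbeddings test above posts to the engine's /v1/embeddings route. For orientation, here is a minimal standalone sketch of the same call; the port value and the OpenAI-style response shape are assumptions for illustration, not taken from this diff:

import requests

PORT = 3928  # hypothetical; the e2e script takes its port from the test harness
resp = requests.post(
    f"http://127.0.0.1:{PORT}/v1/embeddings",
    json={
        "input": "Hello, world",
        "model": "my-llm",  # placeholder alias for a previously loaded model
        "encoding_format": "float",
    },
)
resp.raise_for_status()
vector = resp.json()["data"][0]["embedding"]  # assumed OpenAI-compatible shape
print(len(vector))  # dimensionality of the embedding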
2 changes: 1 addition & 1 deletion src/llama_engine.cc
@@ -328,7 +328,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {

  params.n_gpu_layers = json_body->get("ngl", 100).asInt();
  params.n_ctx = json_body->get("ctx_len", 2048).asInt();
-  params.embedding = json_body->get("embedding", true).asBool();
+  params.embedding = json_body->get("embedding", false).asBool();
  model_type = json_body->get("model_type", "llm").asString();
  params.n_batch = json_body->get("n_batch", 2048).asInt();
  params.n_ubatch = json_body->get("n_ubatch", params.n_batch).asInt();
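Note the default flip above: params.embedding now defaults to false, so a caller that wants embeddings from an LLM has to opt in at load time. A hedged sketch of such a load request follows; the /loadmodel route, port, and file path are illustrative assumptions patterned on the e2e script, and ngl 33 mirrors the test's value (enough to offload all 32 transformer blocks of a 7B-class model plus the output layer):

import requests

payload = {
    "llama_model_path": "/models/my-model.gguf",  # placeholder path
    "model_alias": "my-llm",
    "ngl": 33,          # mirrors the e2e test; covers every layer of a 32-block model
    "ctx_len": 2048,
    "embedding": True,  # must now be explicit, since the engine default is false
}
# "/loadmodel" and the port are assumptions, not taken from this diff
requests.post("http://127.0.0.1:3928/loadmodel", json=payload)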
12 changes: 9 additions & 3 deletions src/llama_server_context.cc
@@ -165,13 +165,15 @@ bool LlamaServerContext::LoadModel(const gpt_params& params_) {

  // https://github.com/ggerganov/llama.cpp/blob/master/examples/llava/README.md
  // note llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
-  if (params.n_ctx < 4096 && IsLlava_1_6(params.model)) {
+  if (params.n_ctx < 4096 && IsLlava_1_6(params.model)) {
    params.n_ctx = 4096;
-    LOG_DEBUG << "Request " << params.n_ctx << " for context length for llava-1.6";
+    LOG_DEBUG << "Request " << params.n_ctx
+              << " for context length for llava-1.6";
  } else if (params.n_ctx <
             2048) { // request larger context for the image embedding
    params.n_ctx = 2048;
-    LOG_DEBUG << "Request " << params.n_ctx << " for context length for the image embedding";
+    LOG_DEBUG << "Request " << params.n_ctx
+              << " for context length for the image embedding";
  }
}

@@ -263,6 +265,10 @@ json LlamaServerContext::GetModelProps() {

int LlamaServerContext::RequestCompletion(json data, bool infill,
                                          bool embedding, int multitask_id) {
+  // From this commit: 'llama : allow pooled embeddings on any model (#7477)'
+  // we need to explicitly set the embedding flag for each request
+  llama_set_embeddings(ctx, embedding);
+
  TaskServer task;
  task.id = id_gen++;
  task.target_id = 0;
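Because llama_set_embeddings() is now called at the top of RequestCompletion(), the context's embeddings mode follows each request rather than the load-time setting, so one loaded model can answer completions and embeddings back to back, which is exactly the ordering the updated e2e script exercises (TestChatCompletion, then TestLlmEmbeddings). A client-side sketch of that interleaving, under the same assumed port and routes as the earlier sketches:

import requests

BASE = "http://127.0.0.1:3928/v1"  # assumed base URL
chat = requests.post(f"{BASE}/chat/completions", json={
    "model": "my-llm",
    "messages": [{"role": "user", "content": "Say hi"}],
})
emb = requests.post(f"{BASE}/embeddings", json={
    "model": "my-llm",
    "input": "Say hi",
    "encoding_format": "float",
})
# Before this change a context loaded without embeddings could not serve the
# second request; the per-request llama_set_embeddings(ctx, embedding) call
# switches modes on the fly.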
