Update llama.cpp submodule to latest release b3197 (#114)
* Update submodule to latest release b3197

* fix: make embedding work again

* fix: update e2e python script

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: vansangpfiev <[email protected]>
3 people authored Jun 22, 2024
1 parent 7857e05 commit 186bcb2
Showing 4 changed files with 25 additions and 6 deletions.
15 changes: 14 additions & 1 deletion .github/scripts/e2e-test-server.py
@@ -112,7 +112,7 @@ def TestLoadChatModel():
"prompt_template": "[INST] {prompt} [/INST]",
"llama_model_path": cwd + "/" + LLM_MODEL + '.gguf',
"model_alias": LLM_MODEL,
"ngl": 32,
"ngl": 33,
"caching_enabled": True
}

@@ -177,6 +177,18 @@ def TestChatCompletion():
        CleanUp()
        exit(1)

+def TestLlmEmbeddings():
+    new_data = {
+        "input": "This PDFs was created using Microsoft Word using the print to PDF function. True PDFs consist of both text and images. We should think about these PDFs having two layers – one layer is the image and a second layer is the text. The image layer shows what the document will look like if it is printed to paper. The text layer is searchable text that is carried over from the original Word file into the new PDF file (the technical term for this layer is “extracted text”). There is no need to make it searchable and the new PDF will have the same text as the original Word file. An example of True PDFs that federal defenders and CJA panel attorneys will be familiar with are the pleadings filed in CM/ECF. The pleading is originally created in Word, but then the attorney either saves it as PDF or prints to PDF and they file that PDF document with the court. Using either process, there is now a PDF file created with an image layer plus text layer. In terms of usability, this is the best type of PDF to receive in discovery as it will have the closest to text searchability of the original file",
+        "model": LLM_MODEL,
+        "encoding_format": "float"
+    }
+    url_post = "http://127.0.0.1:"+ str(port) + "/v1/embeddings"
+    res = RequestPost(new_data, url_post)
+    if not res:
+        CleanUp()
+        exit(1)
+
def TestUnloadModel(model):
    new_data = {
        "model": model,
@@ -255,6 +267,7 @@ def TestEmbeddings():

TestLoadChatModel()
TestChatCompletion()
+TestLlmEmbeddings()
TestUnloadModel(LLM_MODEL)
TestLoadEmbeddingModel()
TestEmbeddings()
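The new TestLlmEmbeddings test above posts to the engine's /v1/embeddings route. For orientation, here is a minimal standalone sketch of the same call; the port value and the OpenAI-style response shape are assumptions for illustration, not taken from this diff:

import requests

PORT = 3928  # hypothetical; the e2e script takes its port from the test harness
resp = requests.post(
    f"http://127.0.0.1:{PORT}/v1/embeddings",
    json={
        "input": "Hello, world",
        "model": "my-llm",  # placeholder alias for a previously loaded model
        "encoding_format": "float",
    },
)
resp.raise_for_status()
vector = resp.json()["data"][0]["embedding"]  # assumed OpenAI-compatible shape
print(len(vector))  # dimensionality of the embedding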
2 changes: 1 addition & 1 deletion src/llama_engine.cc
@@ -328,7 +328,7 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {

  params.n_gpu_layers = json_body->get("ngl", 100).asInt();
  params.n_ctx = json_body->get("ctx_len", 2048).asInt();
-  params.embedding = json_body->get("embedding", true).asBool();
+  params.embedding = json_body->get("embedding", false).asBool();
  model_type = json_body->get("model_type", "llm").asString();
  params.n_batch = json_body->get("n_batch", 2048).asInt();
  params.n_ubatch = json_body->get("n_ubatch", params.n_batch).asInt();
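Note the default flip above: params.embedding now defaults to false, so a caller that wants embeddings from an LLM has to opt in at load time. A hedged sketch of such a load request follows; the /loadmodel route, port, and file path are illustrative assumptions patterned on the e2e script, and ngl 33 mirrors the test's value (enough to offload all 32 transformer blocks of a 7B-class model plus the output layer):

import requests

payload = {
    "llama_model_path": "/models/my-model.gguf",  # placeholder path
    "model_alias": "my-llm",
    "ngl": 33,          # mirrors the e2e test; covers every layer of a 32-block model
    "ctx_len": 2048,
    "embedding": True,  # must now be explicit, since the engine default is false
}
# "/loadmodel" and the port are assumptions, not taken from this diff
requests.post("http://127.0.0.1:3928/loadmodel", json=payload)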
12 changes: 9 additions & 3 deletions src/llama_server_context.cc
@@ -165,13 +165,15 @@ bool LlamaServerContext::LoadModel(const gpt_params& params_) {

  // https://github.com/ggerganov/llama.cpp/blob/master/examples/llava/README.md
  // note llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096)
-  if (params.n_ctx < 4096 && IsLlava_1_6(params.model)) {
+  if (params.n_ctx < 4096 && IsLlava_1_6(params.model)) {
    params.n_ctx = 4096;
-    LOG_DEBUG << "Request " << params.n_ctx << " for context length for llava-1.6";
+    LOG_DEBUG << "Request " << params.n_ctx
+              << " for context length for llava-1.6";
  } else if (params.n_ctx <
             2048) { // request larger context for the image embedding
    params.n_ctx = 2048;
-    LOG_DEBUG << "Request " << params.n_ctx << " for context length for the image embedding";
+    LOG_DEBUG << "Request " << params.n_ctx
+              << " for context length for the image embedding";
  }
}

@@ -263,6 +265,10 @@ json LlamaServerContext::GetModelProps() {

int LlamaServerContext::RequestCompletion(json data, bool infill,
                                          bool embedding, int multitask_id) {
+  // From this commit: 'llama : allow pooled embeddings on any model (#7477)'
+  // we need to explicitly set the embedding flag for each request
+  llama_set_embeddings(ctx, embedding);
+
  TaskServer task;
  task.id = id_gen++;
  task.target_id = 0;
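Because llama_set_embeddings() is now called at the top of RequestCompletion(), the context's embeddings mode follows each request rather than the load-time setting, so one loaded model can answer completions and embeddings back to back, which is exactly the ordering the updated e2e script exercises (TestChatCompletion, then TestLlmEmbeddings). A client-side sketch of that interleaving, under the same assumed port and routes as the earlier sketches:

import requests

BASE = "http://127.0.0.1:3928/v1"  # assumed base URL
chat = requests.post(f"{BASE}/chat/completions", json={
    "model": "my-llm",
    "messages": [{"role": "user", "content": "Say hi"}],
})
emb = requests.post(f"{BASE}/embeddings", json={
    "model": "my-llm",
    "input": "Say hi",
    "encoding_format": "float",
})
# Before this change a context loaded without embeddings could not serve the
# second request; the per-request llama_set_embeddings(ctx, embedding) call
# switches modes on the fly.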
