[Example] ggml: Support LLAMA3 (#131)
Signed-off-by: hydai <[email protected]>
hydai authored Apr 19, 2024
1 parent 6e81ae3 · commit e8b8300
Showing 5 changed files with 79 additions and 10 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/llama.yml
@@ -114,6 +114,34 @@ jobs:
default \
$'[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you do not know the answer to a question, please do not share false information.\n<</SYS>>\nWhat is the capital of Japan?[/INST]'
- name: Llama3 8B
run: |
test -f ~/.wasmedge/env && source ~/.wasmedge/env
cd wasmedge-ggml/llama
curl -LO https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf
cargo build --target wasm32-wasi --release
time wasmedge --dir .:. \
--env n_gpu_layers="$NGL" \
--env llama3=true \
--nn-preload default:GGML:AUTO:Meta-Llama-3-8B-Instruct.Q5_K_M.gguf \
target/wasm32-wasi/release/wasmedge-ggml-llama.wasm \
default \
$"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you do not know the answer to a question, please do not share false information.<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\nWhat's the capital of Japan?<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n"
- name: Llama3 8B (Streaming)
run: |
test -f ~/.wasmedge/env && source ~/.wasmedge/env
cd wasmedge-ggml/llama-stream
curl -LO https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf
cargo build --target wasm32-wasi --release
time wasmedge --dir .:. \
--env n_gpu_layers="$NGL" \
--env llama3=true \
--nn-preload default:GGML:AUTO:Meta-Llama-3-8B-Instruct.Q5_K_M.gguf \
target/wasm32-wasi/release/wasmedge-ggml-llama-stream.wasm \
default \
$"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you do not know the answer to a question, please do not share false information.<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\nWhat's the capital of Japan?<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n"
- name: StarCoder 2 7B
run: |
test -f ~/.wasmedge/env && source ~/.wasmedge/env
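For reference, the long prompt strings passed in the two new steps follow the Llama 3 instruct chat template. Below is a minimal Rust sketch of how such a single-turn prompt is assembled from a system message and a user question; the `build_llama3_prompt` helper is hypothetical and not part of this commit, but the template tokens are taken verbatim from the workflow above.

```rust
/// Assemble a single-turn Llama 3 instruct prompt. The special tokens
/// (<|begin_of_text|>, <|start_header_id|>, <|eot_id|>) match the ones
/// used in the workflow steps above.
fn build_llama3_prompt(system: &str, user: &str) -> String {
    format!(
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{}<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n",
        system, user
    )
}

fn main() {
    let prompt = build_llama3_prompt(
        "You are a helpful, respectful and honest assistant.",
        "What's the capital of Japan?",
    );
    // Prints the exact string the workflow passes as the model prompt.
    print!("{}", prompt);
}
```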
34 changes: 29 additions & 5 deletions wasmedge-ggml/llama-stream/src/main.rs
@@ -27,6 +27,19 @@ fn get_options_from_env() -> Value {
} else {
options["enable-log"] = serde_json::from_str("false").unwrap()
}
if let Ok(val) = env::var("llama3") {
options["llama3"] = serde_json::from_str(val.as_str())
.expect("invalid value for llama3 option (true/false)");
} else {
options["llama3"] = serde_json::from_str("false").unwrap()
}
if let Ok(val) = env::var("ctx_size") {
options["ctx-size"] =
serde_json::from_str(val.as_str()).expect("invalid ctx-size value (unsigned integer)")
} else {
options["ctx-size"] = serde_json::from_str("1024").unwrap()
}

if let Ok(val) = env::var("n_gpu_layers") {
options["n-gpu-layers"] =
serde_json::from_str(val.as_str()).expect("invalid ngl value (unsigned integer)")
@@ -182,12 +195,23 @@ fn main() {
println!("USER:");
let input = read_input();
if saved_prompt.is_empty() {
saved_prompt = format!(
"[INST] <<SYS>> {} <</SYS>> {} [/INST]",
system_prompt, input
);
if options["llama3"].as_bool().unwrap() {
saved_prompt = format!(
"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{}<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n",
system_prompt, input
);
} else {
saved_prompt = format!(
"[INST] <<SYS>> {} <</SYS>> {} [/INST]",
system_prompt, input
);
}
} else {
saved_prompt = format!("{} [INST] {} [/INST]", saved_prompt, input);
if options["llama3"].as_bool().unwrap() {
saved_prompt = format!("{}<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n", saved_prompt, input);
} else {
saved_prompt = format!("{} [INST] {} [/INST]", saved_prompt, input);
}
}

// Set prompt to the input tensor.
Binary file modified wasmedge-ggml/llama-stream/wasmedge-ggml-llama-stream.wasm
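Both examples read their configuration from environment variables and validate each value by parsing it as JSON, so a malformed `llama3` or `ctx_size` fails fast at startup. The following self-contained sketch shows that pattern; the `option_from_env` helper is hypothetical (the real `get_options_from_env` inlines each variable as shown in the diff above).

```rust
use serde_json::{json, Value};
use std::env;

/// Read `var` from the environment, parse it as JSON, and store it under
/// `key` in the options object, falling back to `default` when the variable
/// is unset. Invalid values abort with a descriptive message, mirroring the
/// .expect() calls in get_options_from_env().
fn option_from_env(options: &mut Value, var: &str, key: &str, default: &str) {
    let raw = env::var(var).unwrap_or_else(|_| default.to_string());
    options[key] = serde_json::from_str(&raw)
        .unwrap_or_else(|_| panic!("invalid value for {} option: {}", key, raw));
}

fn main() {
    let mut options = json!({});
    option_from_env(&mut options, "llama3", "llama3", "false");
    option_from_env(&mut options, "ctx_size", "ctx-size", "1024");
    option_from_env(&mut options, "n_gpu_layers", "n-gpu-layers", "0");
    // e.g. {"ctx-size":1024,"llama3":false,"n-gpu-layers":0}
    println!("{}", options);
}
```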
27 changes: 22 additions & 5 deletions wasmedge-ggml/llama/src/main.rs
@@ -27,6 +27,12 @@ fn get_options_from_env() -> Value {
} else {
options["enable-log"] = serde_json::from_str("false").unwrap()
}
if let Ok(val) = env::var("llama3") {
options["llama3"] = serde_json::from_str(val.as_str())
.expect("invalid value for llama3 option (true/false)");
} else {
options["llama3"] = serde_json::from_str("false").unwrap()
}
if let Ok(val) = env::var("n_gpu_layers") {
options["n-gpu-layers"] =
serde_json::from_str(val.as_str()).expect("invalid ngl value (unsigned integer)")
@@ -147,12 +153,23 @@ fn main() {
println!("USER:");
let input = read_input();
if saved_prompt.is_empty() {
saved_prompt = format!(
"[INST] <<SYS>> {} <</SYS>> {} [/INST]",
system_prompt, input
);
if options["llama3"].as_bool().unwrap() {
saved_prompt = format!(
"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{}<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n",
system_prompt, input
);
} else {
saved_prompt = format!(
"[INST] <<SYS>> {} <</SYS>> {} [/INST]",
system_prompt, input
);
}
} else {
saved_prompt = format!("{} [INST] {} [/INST]", saved_prompt, input);
if options["llama3"].as_bool().unwrap() {
saved_prompt = format!("{}<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n", saved_prompt, input);
} else {
saved_prompt = format!("{} [INST] {} [/INST]", saved_prompt, input);
}
}

// Set prompt to the input tensor.
Binary file modified wasmedge-ggml/llama/wasmedge-ggml-llama.wasm
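The template branching is duplicated between the first turn and later turns, and again between the two examples. A hedged sketch of how the accumulation could be factored into one helper follows; `append_turn` is a hypothetical name, but the format strings are exactly the ones from the diff: the first turn emits the full template including the system message, and later turns append only a user block followed by an assistant header.

```rust
/// Append one user turn to the running prompt, using the Llama 3 template
/// when `llama3` is true and the Llama 2 [INST] template otherwise.
fn append_turn(saved_prompt: &str, system_prompt: &str, input: &str, llama3: bool) -> String {
    if saved_prompt.is_empty() {
        if llama3 {
            format!(
                "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{}<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n",
                system_prompt, input
            )
        } else {
            format!("[INST] <<SYS>> {} <</SYS>> {} [/INST]", system_prompt, input)
        }
    } else if llama3 {
        format!(
            "{}<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n",
            saved_prompt, input
        )
    } else {
        format!("{} [INST] {} [/INST]", saved_prompt, input)
    }
}

fn main() {
    let mut prompt = String::new();
    prompt = append_turn(&prompt, "You are a helpful assistant.", "Hi!", true);
    prompt = append_turn(&prompt, "You are a helpful assistant.", "And now?", true);
    println!("{}", prompt);
}
```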
