diff --git a/.github/workflows/llama.yml b/.github/workflows/llama.yml index 079967a..a8c32ce 100644 --- a/.github/workflows/llama.yml +++ b/.github/workflows/llama.yml @@ -101,6 +101,19 @@ jobs: default \ $'[INST] <>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you do not know the answer to a question, please do not share false information.\n<>\nWhat is the capital of Japan?[/INST]' + - name: Llama2 7B (Streaming) + run: | + test -f ~/.wasmedge/env && source ~/.wasmedge/env + cd wasmedge-ggml/llama-stream + curl -LO https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_M.gguf + cargo build --target wasm32-wasi --release + time wasmedge --dir .:. \ + --env n_gpu_layers="$NGL" \ + --nn-preload default:GGML:AUTO:llama-2-7b-chat.Q5_K_M.gguf \ + target/wasm32-wasi/release/wasmedge-ggml-llama-stream.wasm \ + default \ + $'[INST] <>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you do not know the answer to a question, please do not share false information.\n<>\nWhat is the capital of Japan?[/INST]' + - name: StarCoder 2 7B run: | test -f ~/.wasmedge/env && source ~/.wasmedge/env @@ -133,7 +146,7 @@ jobs: target/wasm32-wasi/release/wasmedge-ggml-multimodel.wasm \ 'describe this picture please' - - name: Embedding Example + - name: Embedding Example (All-MiniLM) run: | test -f ~/.wasmedge/env && source ~/.wasmedge/env cd wasmedge-ggml/embedding @@ -145,6 +158,18 @@ jobs: default \ 'hello world' + - name: Embedding Example (Llama-2) + run: | + test -f ~/.wasmedge/env && source ~/.wasmedge/env + cd wasmedge-ggml/embedding + curl -LO https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_K_M.gguf + cargo build --target wasm32-wasi --release + time wasmedge --dir .:. \ + --nn-preload default:GGML:AUTO:llama-2-7b-chat.Q5_K_M.gguf \ + target/wasm32-wasi/release/wasmedge-ggml-llama-embedding.wasm \ + default \ + 'hello world' + - name: RPC Example run: | test -f ~/.wasmedge/env && source ~/.wasmedge/env @@ -171,6 +196,19 @@ jobs: default \ 'user Where is the capital of Japan? model' + - name: Grammar Example + run: | + test -f ~/.wasmedge/env && source ~/.wasmedge/env + cd wasmedge-ggml/grammar + curl -LO https://huggingface.co/TheBloke/Llama-2-7b-GGUF/resolve/main/llama-2-7b.Q5_K_M.gguf + cargo build --target wasm32-wasi --release + time wasmedge --dir .:. 
\ + --env n_gpu_layers="$NGL" \ + --nn-preload default:GGML:AUTO:llama-2-7b.Q5_K_M.gguf \ + target/wasm32-wasi/release/wasmedge-ggml-grammar.wasm \ + default \ + 'JSON object with 5 country names as keys and their capitals as values: ' + - name: Build llama-stream run: | cd wasmedge-ggml/llama-stream diff --git a/wasmedge-ggml/command-r/README.md b/wasmedge-ggml/command-r/README.md index f201459..f4c0052 100644 --- a/wasmedge-ggml/command-r/README.md +++ b/wasmedge-ggml/command-r/README.md @@ -10,23 +10,33 @@ ## Get Model +Here we use the `c4ai-command-r-plus-GGUF` model as an example. You can download the model from the Hugging Face model hub. + ```bash -curl -LO https://huggingface.co/andrewcanis/c4ai-command-r-v01-GGUF/resolve/main/c4ai-command-r-v01-Q5_K_M.gguf +curl -LO https://huggingface.co/pmysl/c4ai-command-r-plus-GGUF/resolve/main/command-r-plus-Q5_K_M-00001-of-00002.gguf +curl -LO https://huggingface.co/pmysl/c4ai-command-r-plus-GGUF/resolve/main/command-r-plus-Q5_K_M-00002-of-00002.gguf ``` ## Execute -```console +In this example, we use the system prompt with the definition of available tools from [Example Rendered Tool Use Prompt](https://huggingface.co/CohereForAI/c4ai-command-r-plus). + +````console $ wasmedge --dir .:. \ - --nn-preload default:GGML:AUTO:c4ai-command-r-v01-Q5_K_M.gguf \ + --nn-preload default:GGML:AUTO:command-r-plus-Q5_K_M-00001-of-00002.gguf \ ./wasmedge-ggml-command-r.wasm default USER: -What's the capital of the United States? +Whats the biggest penguin in the world? ASSISTANT: -The capital of the United States is Washington, D.C. -USER: -How about Japan? -ASSISTANT: -Tokyo is the capital of Japan.
-``` \ No newline at end of file +Action: ```json +[ + { + "tool_name": "internet_search", + "parameters": { + "query": "biggest penguin species" + } + } +] +``` +```` diff --git a/wasmedge-ggml/command-r/src/main.rs b/wasmedge-ggml/command-r/src/main.rs index d0d6ea4..d69d1cd 100644 --- a/wasmedge-ggml/command-r/src/main.rs +++ b/wasmedge-ggml/command-r/src/main.rs @@ -141,7 +141,51 @@ fn main() { } let mut saved_prompt = String::new(); - let system_prompt = String::from("You are a helpful, respectful and honest assistant. Always answer as short as possible, while being safe." ); + let system_tool_prompt = r#" + # Safety Preamble + The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral. + + # System Preamble + ## Basic Rules + You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions. + + # User Preamble + ## Task and Context + You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging. + + ## Style Guide + Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling. 
+ + ## Available Tools + Here is a list of tools that you have available to you: + + ```python + def internet_search(query: str) -> List[Dict]: + """Returns a list of relevant document snippets for a textual query retrieved from the internet + + Args: + query (str): Query to search the internet with + """ + pass + ``` + + ```python + def directly_answer() -> List[Dict]: + """Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history + """ + pass + ``` + "#; + let system_instruction_prompt = r#" + Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example: + ```json + [ + { + "tool_name": title of the tool in the specification, + "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters + } + ]``` + "#; loop { println!("USER:"); @@ -149,8 +193,11 @@ fn main() { // if saved_prompt.is_empty() { saved_prompt = format!( - "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{} {}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", - system_prompt, input + "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{}<|END_OF_TURN_TOKEN|>\ + <|USER_TOKEN|>{}<|END_OF_TURN_TOKEN|>\ + <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{}<|END_OF_TURN_TOKEN|>\ + <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", + system_tool_prompt, input, system_instruction_prompt ); } else { saved_prompt = format!("{} <|START_OF_TURN_TOKEN|><|USER_TOKEN|>{}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", saved_prompt, input); diff --git a/wasmedge-ggml/command-r/wasmedge-ggml-command-r.wasm 
b/wasmedge-ggml/command-r/wasmedge-ggml-command-r.wasm index 90e358e..86074e2 100755 Binary files a/wasmedge-ggml/command-r/wasmedge-ggml-command-r.wasm and b/wasmedge-ggml/command-r/wasmedge-ggml-command-r.wasm differ diff --git a/wasmedge-ggml/grammar/Cargo.toml b/wasmedge-ggml/grammar/Cargo.toml new file mode 100644 index 0000000..7580459 --- /dev/null +++ b/wasmedge-ggml/grammar/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "wasmedge-ggml-grammar" +version = "0.1.0" +edition = "2021" + +[dependencies] +serde_json = "1.0" +wasmedge-wasi-nn = "0.7.0" diff --git a/wasmedge-ggml/grammar/README.md b/wasmedge-ggml/grammar/README.md new file mode 100644 index 0000000..a24c122 --- /dev/null +++ b/wasmedge-ggml/grammar/README.md @@ -0,0 +1,37 @@ +# Grammar Example For WASI-NN with GGML Backend + +> [!NOTE] +> Please refer to the [wasmedge-ggml/README.md](../README.md) for the general introduction and the setup of the WASI-NN plugin with GGML backend. This document will focus on the specific example of using grammar in ggml. + +## Get the Model + +In this example, we are going to use the [llama-2-7b](https://huggingface.co/TheBloke/Llama-2-7B-GGUF) model. Please note that we are not using a fine-tuned chat model. + +```bash +curl -LO https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q5_K_M.gguf +``` + +## Parameters + +> [!NOTE] +> Please check the parameters section of [wasmedge-ggml/README.md](https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters) first. + +In this example, we are going to use the `grammar` option to constrain the model to generate the JSON output in a specific format. + +You can check [the documents at llama.cpp](https://github.com/ggerganov/llama.cpp/tree/master/grammars) for more details about grammars. + +## Execute + +In this example, we are going to use the `n_predict` option to prevent the model from generating too much output. + +```console +$ wasmedge --dir .:.
\ + --env n_predict=99 \ + --nn-preload default:GGML:AUTO:llama-2-7b.Q5_K_M.gguf \ + wasmedge-ggml-grammar.wasm default + +USER: +JSON object with 5 country names as keys and their capitals as values: +ASSISTANT: +{"US": "Washington", "UK": "London", "Germany": "Berlin", "France": "Paris", "Italy": "Rome"} +``` diff --git a/wasmedge-ggml/grammar/src/main.rs b/wasmedge-ggml/grammar/src/main.rs new file mode 100644 index 0000000..28bfbe5 --- /dev/null +++ b/wasmedge-ggml/grammar/src/main.rs @@ -0,0 +1,190 @@ +use serde_json::json; +use serde_json::Value; +use std::env; +use std::io; +use wasmedge_wasi_nn::{ + self, BackendError, Error, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext, + TensorType, +}; + +fn read_input() -> String { + loop { + let mut answer = String::new(); + io::stdin() + .read_line(&mut answer) + .expect("Failed to read line"); + if !answer.is_empty() && answer != "\n" && answer != "\r\n" { + return answer.trim().to_string(); + } + } +} + +fn get_options_from_env() -> Value { + let mut options = json!({}); + if let Ok(val) = env::var("enable_log") { + options["enable-log"] = serde_json::from_str(val.as_str()) + .expect("invalid value for enable-log option (true/false)") + } else { + options["enable-log"] = serde_json::from_str("false").unwrap() + } + if let Ok(val) = env::var("n_gpu_layers") { + options["n-gpu-layers"] = + serde_json::from_str(val.as_str()).expect("invalid ngl value (unsigned integer") + } else { + options["n-gpu-layers"] = serde_json::from_str("0").unwrap() + } + if let Ok(val) = env::var("n_predict") { + options["n-predict"] = + serde_json::from_str(val.as_str()).expect("invalid n-predict value (unsigned integer") + } + options["ctx-size"] = serde_json::from_str("1024").unwrap(); + + options +} + +fn set_data_to_context(context: &mut GraphExecutionContext, data: Vec) -> Result<(), Error> { + context.set_input(0, TensorType::U8, &[1], &data) +} + +#[allow(dead_code)] +fn set_metadata_to_context( + context: &mut 
GraphExecutionContext, + data: Vec, +) -> Result<(), Error> { + context.set_input(1, TensorType::U8, &[1], &data) +} + +fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> String { + // Preserve for 4096 tokens with average token length 6 + const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 6; + let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE]; + let mut output_size = context + .get_output(index, &mut output_buffer) + .expect("Failed to get output"); + output_size = std::cmp::min(MAX_OUTPUT_BUFFER_SIZE, output_size); + + return String::from_utf8_lossy(&output_buffer[..output_size]).to_string(); +} + +fn get_output_from_context(context: &GraphExecutionContext) -> String { + get_data_from_context(context, 0) +} + +fn get_metadata_from_context(context: &GraphExecutionContext) -> Value { + serde_json::from_str(&get_data_from_context(context, 1)).expect("Failed to get metadata") +} + +const JSON_GRAMMAR: &str = r#" +root ::= object +value ::= object | array | string | number | ("true" | "false" | "null") ws +object ::= + "{" ws ( + string ":" ws value + ("," ws string ":" ws value)* + )? "}" ws +array ::= + "[" ws ( + value + ("," ws value)* + )? "]" ws +string ::= + "\"" ( + [^"\\\x7F\x00-\x1F] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) + )* "\"" ws +number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws +ws ::= ([ \t\n] ws)? +"#; + +fn main() { + let args: Vec = env::args().collect(); + let model_name: &str = &args[1]; + + // Set options for the graph. Check our README for more details: + // https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters + let mut options = get_options_from_env(); + + // Add grammar for JSON output. + // Check [here](https://github.com/ggerganov/llama.cpp/tree/master/grammars) for more details. + options["grammar"] = JSON_GRAMMAR.into(); + + // Make the output more consistent. 
+ options["temp"] = json!(0.1); + + // Create graph and initialize context. + let graph = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO) + .config(serde_json::to_string(&options).expect("Failed to serialize options")) + .build_from_cache(model_name) + .expect("Failed to build graph"); + let mut context = graph + .init_execution_context() + .expect("Failed to init context"); + + // If there is a third argument, use it as the prompt and enter non-interactive mode. + // This is mainly for the CI workflow. + if args.len() >= 3 { + let prompt = &args[2]; + // Set the prompt. + println!("Prompt:\n{}", prompt); + let tensor_data = prompt.as_bytes().to_vec(); + context + .set_input(0, TensorType::U8, &[1], &tensor_data) + .expect("Failed to set input"); + println!("Response:"); + + // Get the number of input tokens and llama.cpp versions. + let input_metadata = get_metadata_from_context(&context); + println!("[INFO] llama_commit: {}", input_metadata["llama_commit"]); + println!( + "[INFO] llama_build_number: {}", + input_metadata["llama_build_number"] + ); + println!( + "[INFO] Number of input tokens: {}", + input_metadata["input_tokens"] + ); + + // Get the output. + context.compute().expect("Failed to compute"); + let output = get_output_from_context(&context); + println!("{}", output.trim()); + + // Retrieve the output metadata. + let metadata = get_metadata_from_context(&context); + println!( + "[INFO] Number of input tokens: {}", + metadata["input_tokens"] + ); + println!( + "[INFO] Number of output tokens: {}", + metadata["output_tokens"] + ); + std::process::exit(0); + } + + loop { + println!("USER:"); + let input = read_input(); + + // Set prompt to the input tensor. + set_data_to_context(&mut context, input.as_bytes().to_vec()).expect("Failed to set input"); + + // Execute the inference. 
+ match context.compute() { + Ok(_) => (), + Err(Error::BackendError(BackendError::ContextFull)) => { + println!("\n[INFO] Context full, we'll reset the context and continue."); + } + Err(Error::BackendError(BackendError::PromptTooLong)) => { + println!("\n[INFO] Prompt too long, we'll reset the context and continue."); + } + Err(err) => { + println!("\n[ERROR] {}", err); + } + } + + // Retrieve the output. + let output = get_output_from_context(&context); + println!("ASSISTANT:\n{}", output.trim()); + } +} diff --git a/wasmedge-ggml/grammar/wasmedge-ggml-grammar.wasm b/wasmedge-ggml/grammar/wasmedge-ggml-grammar.wasm new file mode 100755 index 0000000..4cdf64f Binary files /dev/null and b/wasmedge-ggml/grammar/wasmedge-ggml-grammar.wasm differ diff --git a/wasmedge-ggml/llama-stream/src/main.rs b/wasmedge-ggml/llama-stream/src/main.rs index 38974dd..514b130 100644 --- a/wasmedge-ggml/llama-stream/src/main.rs +++ b/wasmedge-ggml/llama-stream/src/main.rs @@ -1,5 +1,5 @@ +use serde_json::json; use serde_json::Value; -use std::collections::HashMap; use std::env; use std::io::{self, Write}; use wasmedge_wasi_nn::{ @@ -19,6 +19,24 @@ fn read_input() -> String { } } +fn get_options_from_env() -> Value { + let mut options = json!({}); + if let Ok(val) = env::var("enable_log") { + options["enable-log"] = serde_json::from_str(val.as_str()) + .expect("invalid value for enable-log option (true/false)") + } else { + options["enable-log"] = serde_json::from_str("false").unwrap() + } + if let Ok(val) = env::var("n_gpu_layers") { + options["n-gpu-layers"] = + serde_json::from_str(val.as_str()).expect("invalid ngl value (unsigned integer") + } else { + options["n-gpu-layers"] = serde_json::from_str("0").unwrap() + } + + options +} + fn set_data_to_context(context: &mut GraphExecutionContext, data: Vec) -> Result<(), Error> { context.set_input(0, TensorType::U8, &[1], &data) } @@ -69,10 +87,7 @@ fn main() { // Set options for the graph. 
Check our README for more details: // https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters - let mut options = HashMap::new(); - options.insert("enable-log", Value::from(false)); - options.insert("n-gpu-layers", Value::from(0)); - options.insert("ctx-size", Value::from(512)); + let options = get_options_from_env(); // Create graph and initialize context. let graph = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO) @@ -94,6 +109,72 @@ fn main() { // ) // .expect("Failed to set metadata"); + // If there is a third argument, use it as the prompt and enter non-interactive mode. + // This is mainly for the CI workflow. + if args.len() >= 3 { + let prompt = &args[2]; + // Set the prompt. + println!("Prompt:\n{}", prompt); + let tensor_data = prompt.as_bytes().to_vec(); + context + .set_input(0, TensorType::U8, &[1], &tensor_data) + .expect("Failed to set input"); + println!("Response:"); + + // Get the number of input tokens and llama.cpp versions. + let input_metadata = get_metadata_from_context(&context); + println!("[INFO] llama_commit: {}", input_metadata["llama_commit"]); + println!( + "[INFO] llama_build_number: {}", + input_metadata["llama_build_number"] + ); + println!( + "[INFO] Number of input tokens: {}", + input_metadata["input_tokens"] + ); + + // Get the output. + let mut output = String::new(); + loop { + match context.compute_single() { + Ok(_) => (), + Err(Error::BackendError(BackendError::EndOfSequence)) => { + break; + } + Err(Error::BackendError(BackendError::ContextFull)) => { + println!("\n[INFO] Context full, we'll reset the context and continue."); + break; + } + Err(Error::BackendError(BackendError::PromptTooLong)) => { + println!("\n[INFO] Prompt too long, we'll reset the context and continue."); + break; + } + Err(err) => { + println!("\n[ERROR] {}", err); + std::process::exit(1); + } + } + // Retrieve the single output token and print it. 
+ let token = get_single_output_from_context(&context); + print!("{}", token); + io::stdout().flush().unwrap(); + output += &token; + } + println!(); + + // Retrieve the output metadata. + let metadata = get_metadata_from_context(&context); + println!( + "[INFO] Number of input tokens: {}", + metadata["input_tokens"] + ); + println!( + "[INFO] Number of output tokens: {}", + metadata["output_tokens"] + ); + std::process::exit(0); + } + let mut saved_prompt = String::new(); let system_prompt = String::from("You are a helpful, respectful and honest assistant. Always answer as short as possible, while being safe." ); diff --git a/wasmedge-ggml/llama-stream/wasmedge-ggml-llama-stream.wasm b/wasmedge-ggml/llama-stream/wasmedge-ggml-llama-stream.wasm index 8e43c13..fc84d2b 100755 Binary files a/wasmedge-ggml/llama-stream/wasmedge-ggml-llama-stream.wasm and b/wasmedge-ggml/llama-stream/wasmedge-ggml-llama-stream.wasm differ