better docs for configuration
mmoskal committed Oct 29, 2024
1 parent 9fd8000 commit aa19c1c
Showing 3 changed files with 70 additions and 56 deletions.
55 changes: 31 additions & 24 deletions README.md
@@ -4,7 +4,7 @@ This project demonstrates how to use
[llguidance library](https://github.com/microsoft/llguidance)
for constrained output with
[NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM),
implementing a REST server compatible with
[OpenAI APIs](https://platform.openai.com/docs/api-reference/introduction).

The server supports regular completions and chat endpoints
@@ -48,7 +48,7 @@ It takes about 15 minutes on a GitHub runner, should be typically faster on a lo

### Building the TensorRT-LLM Engine

This follows
[TensorRT-LLM Quick-start](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html),
adjusted for running in the `llgtrt/llgtrt` container.
First, use the `llgtrt/llgtrt` container to run bash.
@@ -84,7 +84,7 @@ cp /models/Meta-Llama-3.1-8B-Instruct/tokenizer_config.json /models/model-engine
exit
```

Make sure to modify the path to the input model (it needs to contain the
HF Transformers `config.json` as well as the `.safetensors` files and
`tokenizer.json`).
If you're running on more than one GPU, modify the `--tp_size` argument.
@@ -99,34 +99,41 @@ The command will print out the actual `docker run` invocation on the first line
if you want to invoke it directly later.
`PORT` defaults to 3000.
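
For illustration, the server can be started on a different port and then exercised with a plain OpenAI-style request. This is a sketch only: it assumes `run.sh` picks up `PORT` from the environment and that the chat endpoint follows the usual OpenAI path, neither of which is spelled out in this section.

```bash
# Start on port 8080 instead of the default 3000 (assumes run.sh honors PORT).
PORT=8080 ./docker/run.sh /path/to/hf-models/model-engine

# In another shell: a minimal OpenAI-style chat request (endpoint path assumed).
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"messages": [{"role": "user", "content": "Hello!"}]}'
```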

### Update configuration
### Update Configuration (optional)

You can pass additional arguments after the engine path.
Try running `./docker/run.sh /path/to/hf-models/model-engine --help` for more info.
Most of the options are specified in configuration files,
but which configuration files are used can be modified with command line arguments.
The defaults should be mostly reasonable, but you can modify them.
First, generate a template configuration file:

By default, llgtrt will use the chat template from `tokenizer_config.json`.
```bash
./docker/run.sh /path/to/hf-models/model-engine --print-config > llgtrt.json5
```

If present, it will also read `tokenizer_config_llgtrt.json` from the same directory
and apply any keys from it to `tokenizer_config.json`.
Afterwards, if a `chat_template.j2` file is found, it will be used as the chat template.
The file will contain commented-out defaults for all supported options
(JSON5 is a superset of JSON, so you can use comments).
Edit it, and move it to the engine folder.
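
As a rough sketch of what the edited file can look like (the exact key names are whatever `--print-config` emits; `runtime` and `llguidance` below are inferred from the descriptions in this README):

```json5
{
  // Leave most keys commented out to keep the built-in defaults.
  // runtime: {
  //   ... TensorRT-LLM runtime parameters ...
  // },

  // Example override: debug logging from the llguidance parser.
  llguidance: {
    log_level: 2,
  },
}
```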

You can also modify TensorRT-LLM's runtime configuration with a `runtime.json` file
and the `llguidance_parser` configuration with `llguidance.json`.
This is optional, see below.
To modify the chat template, you can either use `--print-complete-config`
above, which will include the chat template from `tokenizer_config.json`,
or preferably create a separate `chat_template.j2` file in the engine folder:

```bash
./docker/run.sh /path/to/hf-models/model-engine --print-chat-template > chat_template.j2
mv chat_template.j2 /path/to/hf-models/model-engine
```

The paths to `llgtrt.json5` and `chat_template.j2` are controlled by command
line arguments, see `--help` for more info:

```bash
./docker/run.sh /path/to/hf-models/model-engine --help
```

The `--help` output has up-to-date info on the `runtime.json` file -
the options can be specified either in these files (replace `-` with `_`)
or on the command line.
You can even specify several JSON5 config files, and they will be merged
in the order they are specified (with later ones overriding the earlier ones).
This way, you can separate configuration for tokenizer, runtime, and guidance parser.
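
For example, a hypothetical split across three files (the file names here are illustrative) can be passed as repeated `--config` arguments after the engine path:

```bash
./docker/run.sh /path/to/hf-models/model-engine \
  --config tokenizer.json5 \
  --config runtime.json5 \
  --config llguidance.json5
```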

The `llguidance.json` file contains a `ParserLimits` structure
under the `limits` key (the defaults should generally be good)
and a `log_level`, which defaults to `1` (warnings only);
set it to `2` for debug logging from the parser
or `0` to disable warnings.
You can enable additional logging for llguidance by setting `llguidance.log_level` to `2`
in the configuration file.

## Development

@@ -159,4 +166,4 @@ which has similar aims, but uses NVidia Triton Server wrapping TensorRT-LLM.
- [ ] logprobs with argmax sampling and constraints
- [ ] expose the 'surprise' measure somehow
- [x] for tools, right now it forces a tool - we want to allow either message or tool
- [ ] we use `<|python_tag|>` which is llama 3.1 specific; make it configurable
44 changes: 25 additions & 19 deletions llgtrt/src/config.rs
@@ -8,6 +8,8 @@ pub fn config_info() -> serde_json::Value {
serde_json::from_str(CONFIG_INFO).unwrap()
}

const CONFIG_OPTIONS: &str = "Configuration files handling";

#[derive(Debug, Serialize, Deserialize)]
pub struct TrtLlmRuntimeConfig {
/// Make the scheduler more conservative, so that a started request is never evicted.
@@ -56,7 +58,7 @@ impl Default for TrtLlmRuntimeConfig {
#[derive(Debug, Serialize, Deserialize, Default)]
pub struct LlgTrtConfig {
/// TensorRT-LLM runtime parameters
    /// Defaults should be reasonable, otherwise see
/// https://nvidia.github.io/TensorRT-LLM/performance/perf-best-practices.html
pub runtime: TrtLlmRuntimeConfig,

@@ -87,24 +89,6 @@ pub struct CliConfig {
#[arg(long, short = 'T')]
pub tokenizer: Option<String>,

/// Path to JSON5 configuration file; multiple files are JSON-merged in order; defaults to:
/// <engine>/llgtrt.json5 if it exists
#[arg(long, short = 'C')]
pub config: Vec<String>,

/// Path to chat template file; defaults to <engine>/chat_template.j2 if it exists
/// Overrides values in all configs.
#[arg(long)]
pub chat_template: Option<String>,

/// When present, save the merged configuration to this file and exit; use '-' for stdout
#[arg(long)]
pub save_config: Option<String>,

/// Similar to --save-config, but includes chat template and tokenizer config
#[arg(long)]
pub save_complete_config: Option<String>,

/// Debug output
#[arg(long, short = 'd')]
pub debug: bool,
@@ -117,4 +101,26 @@ pub struct CliConfig {
#[arg(long)]
#[serde(skip_serializing_if = "Option::is_none")]
pub api_key: Option<String>,

/// Path to JSON5 configuration file; multiple files are JSON-merged in order; defaults to:
/// <engine>/llgtrt.json5 if it exists
#[arg(long, short = 'C', help_heading = CONFIG_OPTIONS)]
pub config: Vec<String>,

/// Path to chat template file; defaults to <engine>/chat_template.j2 if it exists
/// Overrides values in all configs.
#[arg(long, help_heading = CONFIG_OPTIONS)]
pub chat_template: Option<String>,

/// Print the merged configuration and exit
#[arg(long, help_heading = CONFIG_OPTIONS)]
pub print_config: bool,

/// Similar to --print-config, but includes chat template and tokenizer config
#[arg(long, help_heading = CONFIG_OPTIONS)]
pub print_complete_config: bool,

/// Print the chat template and exit
#[arg(long, help_heading = CONFIG_OPTIONS)]
pub print_chat_template: bool,
}
27 changes: 14 additions & 13 deletions llgtrt/src/startup.rs
@@ -57,7 +57,7 @@ pub async fn run_server(mut cli_config: CliConfig) -> anyhow::Result<()> {

let mut config = LlgTrtConfig::default();

if cli_config.save_config.is_some() {
if cli_config.print_config {
log::info!("Skipping tokenizer config load");
} else {
let tokenizer_folder = cli_config.tokenizer.as_ref().unwrap_or(&cli_config.engine);
@@ -81,7 +81,7 @@
let mut config: LlgTrtConfig =
serde_json::from_value(config).map_err(|e| anyhow!("Error interpreting config: {}", e))?;

if cli_config.save_config.is_some() {
if cli_config.print_config {
log::info!("Skipping separate chat template load");
} else {
let chat_template = cli_config
@@ -94,23 +94,24 @@
}
}

if let Some(filename) = cli_config
.save_config
.as_ref()
.or(cli_config.save_complete_config.as_ref())
{
if cli_config.print_config || cli_config.print_complete_config {
let r = json5_to_string(
&serde_json::to_value(&config)?,
&serde_json::to_value(&LlgTrtConfig::default())?,
&config_info(),
);
if filename == "-" {
log::info!("Printing merged config to stdout");
println!("{}", r);
} else {
log::info!("Saving merged config to {}", filename);
std::fs::write(filename, r)?;
log::info!("Printing merged config to stdout");
println!("{}", r);
return Ok(());
}

if cli_config.print_chat_template {
log::info!("Printing chat template to stdout");
if config.tokenizer.chat_template.is_none() {
log::warn!("No chat template found");
return Ok(());
}
print!("{}", config.tokenizer.chat_template.as_ref().unwrap());
return Ok(());
}

