From 9fd800065b4fd0eec9cde0e1cc994d8a4d71a520 Mon Sep 17 00:00:00 2001 From: Michal Moskal Date: Tue, 29 Oct 2024 21:35:18 +0000 Subject: [PATCH] reworking how to pass config files --- .gitignore | 1 - .vscode/settings.json | 15 ++++ Cargo.lock | 128 +++++++++++++++++++++++++++ README.md | 36 ++++---- llgtrt/Cargo.toml | 1 + llgtrt/chat_templates/llama31.j2 | 92 ++++++++++++++++++++ llgtrt/src/async_exec.rs | 15 +++- llgtrt/src/chat.rs | 27 +----- llgtrt/src/config.rs | 112 ++++++++++++++++-------- llgtrt/src/config_info.json | 57 ++++++++++++ llgtrt/src/constraint_mgr.rs | 33 +++---- llgtrt/src/jsonutil.rs | 101 ++++++++++++++++++++++ llgtrt/src/lib.rs | 3 +- llgtrt/src/logging.rs | 4 +- llgtrt/src/main.rs | 4 +- llgtrt/src/startup.rs | 143 ++++++++++++++++++------------- llgtrt/src/tokenizer.rs | 82 ++++++++---------- llguidance | 2 +- scripts/collect-comments.py | 99 +++++++++++++++++++++ scripts/regen.sh | 4 + 20 files changed, 745 insertions(+), 214 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 llgtrt/chat_templates/llama31.j2 create mode 100644 llgtrt/src/config_info.json create mode 100644 llgtrt/src/jsonutil.rs create mode 100644 scripts/collect-comments.py create mode 100755 scripts/regen.sh diff --git a/.gitignore b/.gitignore index 94c01dc..d242cfb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ build tmp -.vscode/settings.json target model.cache diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..5c1ff3b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,15 @@ +{ + "C_Cpp.autoAddFileAssociations": false, + "files.readonlyInclude": { + "**/config_info.json": true + }, + "cSpell.words": [ + "ckpt", + "fmha", + "llgtrt", + "mpirun", + "npuichigo", + "openai", + "trtllm" + ] +} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 0d0fdde..da5f374 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -287,6 +287,15 @@ version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "borrow-or-share" version = "0.2.2" @@ -487,6 +496,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cpufeatures" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.4.2" @@ -521,6 +539,16 @@ version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "darling" version = "0.14.4" @@ -602,6 +630,16 @@ dependencies = [ "toml", ] +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "dirs" version = "5.0.1" @@ -805,6 +843,16 @@ dependencies = [ "pin-utils", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -1226,6 +1274,17 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json5" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96b0db21af676c1ce64250b5f40f3ce2cf27e4e47cb91ed91eb6fe9350b430c1" +dependencies = [ + "pest", + "pest_derive", + "serde", +] + [[package]] name = "jsonschema" version = "0.24.0" @@ -1307,6 +1366,7 @@ dependencies = [ "clap", "flexi_logger", "futures-core", + "json5", "llguidance_parser", "log", "minijinja", @@ -1723,6 +1783,51 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "pest" +version = "2.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879952a81a83930934cbf1786752d6dedc3b1f29e8f8fb2ad1d0a36f377cf442" +dependencies = [ + "memchr", + "thiserror", + "ucd-trie", +] + +[[package]] +name = "pest_derive" +version = "2.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d214365f632b123a47fd913301e14c946c61d1c183ee245fa76eb752e59a02dd" +dependencies = [ + "pest", + "pest_generator", +] + +[[package]] +name = "pest_generator" +version = "2.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb55586734301717aea2ac313f50b2eb8f60d2fc3dc01d190eefa2e625f60c4e" +dependencies = [ + "pest", + "pest_meta", + "proc-macro2", + "quote", + "syn 2.0.77", +] + +[[package]] +name = "pest_meta" +version = "2.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b75da2a70cf4d9cb76833c990ac9cd3923c9a8905a8929789ce347c84564d03d" +dependencies = [ + "once_cell", + "pest", + "sha2", +] + [[package]] name = "pin-project" version = "1.1.5" @@ -2218,6 +2323,17 @@ dependencies = [ "serde", ] +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "shlex" version = "1.3.0" @@ -2595,6 +2711,18 @@ dependencies = [ "serde", ] +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + +[[package]] +name = "ucd-trie" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" + [[package]] name = "unicode-bidi" version = "0.3.15" diff --git a/README.md b/README.md index 78a3ebc..634d887 100644 --- a/README.md +++ b/README.md @@ -4,15 +4,15 @@ This project demonstrates how to use [llguidance library](https://github.com/microsoft/llguidance) for constrained output with [NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), -implementing a server with -[OpenAI REST 
API](https://platform.openai.com/docs/api-reference/introduction). +implementing a REST server compatible with +[OpenAI APIs](https://platform.openai.com/docs/api-reference/introduction). The server supports regular completions and chat endpoints -with JSON with schema enforcement ("Structured Output" in OpenAI docs), +with JSON schema enforcement ("Structured Output"), as well as full context-free grammars using [Guidance library](https://github.com/guidance-ai/guidance). This server is similar in spirit to [TensorRT-LLM OpenAI server example](./TensorRT-LLM/examples/apps/openai_server.py), -but python-free and with support for constrained output. +but is Python-free (implemented in Rust) and supports constrained output. Similarly to the example above, it **does not** use the NVIDIA Triton Inference Server. ## Requirements @@ -89,17 +89,6 @@ HF Transformers `config.json` as well as the `.safetensors` files and `tokenizer.json`). If you're running on more than one GPU, modify the `--tp_size` argument. -### Create config files - -By default, llgtrt will use chat template from `tokenizer_config.json`. -If present, it will also read `tokenizer_config_llgtrt.json` from the same directory -and apply any keys from it to `tokenizer_config.json`. -Afterwards, if `chat_template.j2` file is found, it will be used as the chat template. - -You can also modify TensortRT-LLM's runtime configuration with `runtime.json` file -and `llguidance_parser` configuration with `llguidance.json`. -This is optional, see below. - ### Running the Engine ```bash @@ -110,8 +99,25 @@ The command will print out the actual `docker run` invocation on first line if you want to invoke it directly later. `PORT` defaults to 3000. +### Update configuration + You can pass additional arguments after the engine path. Try running `./docker/run.sh /path/to/hf-models/model-engine --help` for more info. +Most of the options are specified in configuration files, +but you can select which configuration files are used via command-line arguments. + +By default, llgtrt will use the chat template from `tokenizer_config.json`. + +If present, it will also read `tokenizer_config_llgtrt.json` from the same directory +and apply any keys from it to `tokenizer_config.json`. +Afterwards, if a `chat_template.j2` file is found, it will be used as the chat template. + +You can also modify TensorRT-LLM's runtime configuration with the `runtime.json` file +and `llguidance_parser` configuration with `llguidance.json`. +This is optional, see below. + + + The `--help` has up-to-date info on the `runtime.json` file - the options can be specified either in these files (replace `-` with `_`) or on the command line.
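To make the layered configuration concrete, here is a hypothetical `llgtrt.json5` placed next to the engine. The key names follow the `LlgTrtConfig`/`TrtLlmRuntimeConfig` fields introduced in this patch, but the values themselves are purely illustrative:

```json5
// llgtrt.json5 -- a sketch; every key is optional and falls back to the
// built-in defaults (inspect them with `--save-config -`)
{
  runtime: {
    max_batch_size: 64,                    // default is 128
    kv_cache_free_gpu_mem_fraction: 0.85,  // default is 0.9
  },
  llguidance: {
    log_level: 2,  // stderr verbosity; default is 1
  },
}
```

Several `--config` files can be passed; later files override earlier ones key-by-key (see the merge sketch at the end of this patch).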
diff --git a/llgtrt/Cargo.toml b/llgtrt/Cargo.toml index 3b7ea37..1db729f 100644 --- a/llgtrt/Cargo.toml +++ b/llgtrt/Cargo.toml @@ -23,3 +23,4 @@ rayon = "1.10.0" futures-core = "0.3.30" minijinja = { version = "2.3.1", features = ["preserve_order", "loop_controls", "loader"] } chrono = "0.4.38" +json5 = "0.4.1" diff --git a/llgtrt/chat_templates/llama31.j2 b/llgtrt/chat_templates/llama31.j2 new file mode 100644 index 0000000..0829896 --- /dev/null +++ b/llgtrt/chat_templates/llama31.j2 @@ -0,0 +1,92 @@ +{#- This is adapted from huggingface tokenizer_config.json/chat_template but updated to match #} +{#- https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/prompt_format.md #} +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "21 September 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if not tools is none %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Answer the user's question by making use of the following functions if needed.\n" }} + {{- "If none of the functions can be used, please say so.\n" }} + {{- "Here is a list of functions in JSON format:\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '<|python_tag|>{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython"
%} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} + diff --git a/llgtrt/src/async_exec.rs b/llgtrt/src/async_exec.rs index edbdf92..4fb57f3 100644 --- a/llgtrt/src/async_exec.rs +++ b/llgtrt/src/async_exec.rs @@ -16,7 +16,12 @@ use trtllm_rs::{ TlcLogitsEntry, }; -use crate::{chat::ChatBuilder, config::Config, routes::openai::FinishReason, tokenizer::setup_tokenizer}; +use crate::{ + chat::ChatBuilder, + config::{CliConfig, LlgTrtConfig}, + routes::openai::FinishReason, + tokenizer::setup_tokenizer, +}; pub struct StepResults { pub response: ResponseChunk, @@ -330,7 +335,11 @@ impl AsyncExecutor { self.executor.cancel_request(req_id) } - pub fn new(cli_config: &Config, mut executor_init: ExecutorInit) -> Result<(Self, TokEnv, ChatBuilder)> { + pub fn new( + cli_config: &CliConfig, + config: &LlgTrtConfig, + mut executor_init: ExecutorInit, + ) -> Result<(Self, TokEnv, ChatBuilder)> { executor_init.logits_callback = Some(logits_processor); let max_batch_size = executor_init.trt_params.max_batch_size as usize; log::info!("new executor: max_batch_size={max_batch_size}"); @@ -340,7 +349,7 @@ impl AsyncExecutor { executor.check_mpi(); // only setup tokenizer on rank 0 - let (tok_env, chat_builder) = setup_tokenizer(cli_config)?; + let (tok_env, chat_builder) = setup_tokenizer(cli_config, config)?; let trie = tok_env.tok_trie(); let n_vocab = trie.vocab_size(); diff --git a/llgtrt/src/chat.rs b/llgtrt/src/chat.rs index 7c90278..32fa4d6 100644 --- a/llgtrt/src/chat.rs +++ b/llgtrt/src/chat.rs @@ -1,4 +1,5 @@ use crate::{ + jsonutil, routes::openai::{ChatCompletionMessageContentPart, ChatCompletionMessageParams, Tool}, tokenizer::TokenizerConfig, }; @@ -6,15 +7,6 @@ use anyhow::anyhow; use minijinja::{value::Kwargs, Environment, Error, ErrorKind, Value}; use serde::{Deserialize, Serialize}; -const DEFAULT_TEMPLATE: &str = r#"{{- bos_token }} -{%- for message in messages %} - {{- '<|' + message['role'] + |>\n' }} - {{- message['content'] + eos_token }} -{%- endfor %} -{%- if add_generation_prompt %} - {{- '<|assistant|>\n' }} -{%- endif %}"#; - pub struct ChatBuilder { default_context: TemplateContext, env: Environment<'static>, @@ -48,19 +40,6 @@ fn date_string() -> String { chrono::Utc::now().format("%e %B %Y").to_string() } -fn remove_null(v: &mut serde_json::Value) { - if let Some(map) = v.as_object_mut() { - for (_, v) in map.iter_mut() { - remove_null(v); - } - map.retain(|_, v| !v.is_null()); - } - // remove empty arrays - if let Some(arr) = v.as_array_mut() { - arr.iter_mut().for_each(remove_null); - } -} - fn tojson(value: Value, args: Kwargs) -> Result<Value, Error> { let indent = match args.get::<usize>("indent") { Ok(val) => val, @@ -113,7 +92,7 @@ impl ChatBuilder { let template = config .chat_template .clone() - .unwrap_or_else(|| DEFAULT_TEMPLATE.to_string()); + .expect("chat_template should be set in TokenizerConfig"); log::info!("chat template:\n{}", template); env.add_template_owned("chat", template) .map_err(|e| anyhow!("error parsing chat_template: {}", e))?; @@ -147,7 +126,7 @@ impl ChatBuilder { context.tools = Some(params.tools.clone()); } let mut context = serde_json::to_value(&context)?; - remove_null(&mut context); + jsonutil::remove_null(&mut context); let r = self
.env .get_template("chat") diff --git a/llgtrt/src/config.rs b/llgtrt/src/config.rs index b1ee8a3..075e93f 100644 --- a/llgtrt/src/config.rs +++ b/llgtrt/src/config.rs @@ -1,45 +1,76 @@ -use clap::{Args, Parser}; +use clap::Parser; use serde::{Deserialize, Serialize}; -const TRT_CONFIG: &str = "TensorRT-LLM runtime config (runtime.json)"; +use crate::{constraint_mgr::LlgConfig, tokenizer::TokenizerConfig}; -#[derive(Args, Debug, Serialize, Deserialize)] +const CONFIG_INFO: &str = include_str!("config_info.json"); +pub fn config_info() -> serde_json::Value { + serde_json::from_str(CONFIG_INFO).unwrap() +} + +#[derive(Debug, Serialize, Deserialize)] pub struct TrtLlmRuntimeConfig { - /// When set to true, the scheduler is more conservative, so that a started request is never evicted; defaults to false (which improves throughput) - #[clap(long, help_heading = TRT_CONFIG)] - pub guaranteed_no_evict: Option<bool>, + /// Make the scheduler more conservative, so that a started request is never evicted. + /// Defaults to false (which improves throughput) + pub guaranteed_no_evict: bool, + + /// Maximum number of concurrent requests + pub max_batch_size: usize, + + /// Maximum number of tokens in batch + pub max_num_tokens: usize, - /// Maximum number of concurrent requests; defaults to 128 - #[clap(long, help_heading = TRT_CONFIG)] - pub max_batch_size: Option<usize>, + /// Maximum number of requests in queue (when batch already full) + pub max_queue_size: usize, - /// Maximum number of tokens in batch; defaults to 8192 - #[clap(long, help_heading = TRT_CONFIG)] - pub max_num_tokens: Option<usize>, + /// Chunk prefill/generation into pieces + /// Defaults to true (unlike trtllm) + pub enable_chunked_context: bool, - /// Maximum number of requests in queue (when batch already full); defaults to 0 - #[clap(long, help_heading = TRT_CONFIG)] - pub max_queue_size: Option<usize>, + /// Prefix-caching (LRU-reuse blocks between requests) + /// Defaults to true (unlike trtllm) + pub enable_kv_cache_reuse: bool, + + /// Fraction of free GPU memory to use for KV cache + pub kv_cache_free_gpu_mem_fraction: f32, + + /// Host memory to use for KV cache + pub kv_cache_host_memory_megabytes: usize, +} - /// Chunk prefill/generation into pieces; defaults to true (unlike trtllm) - #[clap(long, help_heading = TRT_CONFIG)] - pub enable_chunked_context: Option<bool>, +impl Default for TrtLlmRuntimeConfig { + fn default() -> Self { + Self { + guaranteed_no_evict: false, + max_batch_size: 128, + max_num_tokens: 8192, + max_queue_size: 0, + enable_chunked_context: true, + enable_kv_cache_reuse: true, + kv_cache_free_gpu_mem_fraction: 0.9, + kv_cache_host_memory_megabytes: 0, + } + } +} - /// Prefix-caching (LRU-reuse blocks between requests); defaults to true (unlike trtllm) - #[clap(long, help_heading = TRT_CONFIG)] - pub enable_kv_cache_reuse: Option<bool>, +#[derive(Debug, Serialize, Deserialize, Default)] +pub struct LlgTrtConfig { + /// TensorRT-LLM runtime parameters + /// Defaults should be reasonable, otherwise see + /// https://nvidia.github.io/TensorRT-LLM/performance/perf-best-practices.html + pub runtime: TrtLlmRuntimeConfig, - /// Fraction of free GPU memory to use for KV cache; defaults to 0.9 - #[clap(long, help_heading = TRT_CONFIG)] - pub kv_cache_free_gpu_mem_fraction: Option<f32>, + /// Tokenizer configuration (defaults to tokenizer_config.json contents) + /// Typically no changes are needed here, except for chat_template + /// which is best overridden with --chat-template filename.j2 option.
+ pub tokenizer: TokenizerConfig, - /// Host memory to use for KV cache; defaults to 0 - #[clap(long, help_heading = TRT_CONFIG)] - pub kv_cache_host_memory_megabytes: Option<usize>, + /// Configuration for the LLGuidance constraint library + pub llguidance: LlgConfig, } #[derive(Parser, Debug, Serialize, Deserialize)] -pub struct Config { +pub struct CliConfig { /// Host to bind to #[arg(long, short = 'H', default_value_t = String::from("0.0.0.0"))] pub host: String, @@ -56,13 +87,23 @@ #[arg(long, short = 'T')] pub tokenizer: Option<String>, - /// Path to JSON file TensorRT-LLM runtime config; defaults to runtime.json in engine dir - #[arg(long, short = 'R')] - pub runtime_config: Option<String>, + /// Path to JSON5 configuration file; multiple files are JSON-merged in order; defaults to: + /// <engine_dir>/llgtrt.json5 if it exists + #[arg(long, short = 'C')] + pub config: Vec<String>, + + /// Path to chat template file; defaults to <engine_dir>/chat_template.j2 if it exists + /// Overrides values in all configs. + #[arg(long)] + pub chat_template: Option<String>, - /// Path to JSON file with llguidance library config; defaults to llguidance.json in engine dir - #[arg(long, short = 'L')] - pub llguidance_config: Option<String>, + /// When present, save the merged configuration to this file and exit; use '-' for stdout + #[arg(long)] + pub save_config: Option<String>, + + /// Similar to --save-config, but includes chat template and tokenizer config + #[arg(long)] + pub save_complete_config: Option<String>, /// Debug output #[arg(long, short = 'd')] pub debug: bool, @@ -72,9 +113,6 @@ #[arg(long, short = 'q')] pub quiet: bool, - #[clap(flatten)] - pub runtime_config_inline: TrtLlmRuntimeConfig, - /// Api Key to access the server #[arg(long)] #[serde(skip_serializing_if = "Option::is_none")] diff --git a/llgtrt/src/config_info.json b/llgtrt/src/config_info.json new file mode 100644 index 0000000..028b2c6 --- /dev/null +++ b/llgtrt/src/config_info.json @@ -0,0 +1,57 @@ +{ + "##info##": "Use scripts/regen.sh to re-generate this file", + "runtime": { + "#": "TensorRT-LLM runtime parameters\nDefaults should be reasonable, otherwise see\nhttps://nvidia.github.io/TensorRT-LLM/performance/perf-best-practices.html", + "guaranteed_no_evict": { + "#": "Make the scheduler more conservative, so that a started request is never evicted.\nDefaults to false (which improves throughput)" + }, + "max_batch_size": { + "#": "Maximum number of concurrent requests" + }, + "max_num_tokens": { + "#": "Maximum number of tokens in batch" + }, + "max_queue_size": { + "#": "Maximum number of requests in queue (when batch already full)" + }, + "enable_chunked_context": { + "#": "Chunk prefill/generation into pieces\nDefaults to true (unlike trtllm)" + }, + "enable_kv_cache_reuse": { + "#": "Prefix-caching (LRU-reuse blocks between requests)\nDefaults to true (unlike trtllm)" + }, + "kv_cache_free_gpu_mem_fraction": { + "#": "Fraction of free GPU memory to use for KV cache" + }, + "kv_cache_host_memory_megabytes": { + "#": "Host memory to use for KV cache" + } + }, + "tokenizer": { + "#": "Tokenizer configuration (defaults to tokenizer_config.json contents)\nTypically no changes are needed here, except for chat_template\nwhich is best overridden with --chat-template filename.j2 option."
+ }, + "llguidance": { + "#": "Configuration for the LLGuidance constraint library", + "limits": { + "#": "Override any of the parser limits.", + "max_items_in_row": { + "#": "For non-ambiguous grammars, this is the maximum \"branching factor\" of the grammar.\nFor ambiguous grammars, this might get hit much quicker.\nDefault: 200" + }, + "initial_lexer_fuel": { + "#": "How much \"fuel\" are we willing to spend to build initial lexer regex AST nodes.\nDefault: 1_000_000 (~20ms)" + }, + "step_lexer_fuel": { + "#": "Maximum lexer fuel for computation of the whole token mask.\nDefault: 500_000 (~10ms)" + }, + "max_lexer_states": { + "#": "Maximum number of lexer states.\nDefault: 10_000" + }, + "max_grammar_size": { + "#": "Maximum size of the grammar (symbols in productions)\nDefault: 500_000 (a few megabytes of JSON)" + } + }, + "log_level": { + "#": "Log level which goes to stderr. In-memory logs per-sequence are managed by ConstraintInit.log_level." + } + } +} \ No newline at end of file diff --git a/llgtrt/src/constraint_mgr.rs b/llgtrt/src/constraint_mgr.rs index 63aa7df..63d04f0 100644 --- a/llgtrt/src/constraint_mgr.rs +++ b/llgtrt/src/constraint_mgr.rs @@ -13,7 +13,16 @@ pub struct LlgConfig { pub limits: ParserLimits, /// Log level which goes to stderr. In-memory logs per-sequence are managed by ConstraintInit.log_level. - pub log_level: Option<u32>, + pub log_level: u32, +} + +impl Default for LlgConfig { + fn default() -> Self { + Self { + limits: ParserLimits::default(), + log_level: 1, + } + } } pub struct ConstraintInit { @@ -31,23 +40,7 @@ pub struct ConstraintMgr { } impl ConstraintMgr { - pub fn new( - tok_env: TokEnv, - chat_tok_env: TokEnv, - mut config: serde_json::Value, - ) -> Result<ConstraintMgr> { - let defl_limits = serde_json::to_value(ParserLimits::default()).unwrap(); - if let Some(obj) = config["limits"].as_object_mut() { - for (k, v) in defl_limits.as_object().unwrap() { - if !obj.contains_key(k) { - obj.insert(k.clone(), v.clone()); - } - } - } else { - config["limits"] = defl_limits; - } - let config: LlgConfig = serde_json::from_value(config)?; - + pub fn new(tok_env: TokEnv, chat_tok_env: TokEnv, config: &LlgConfig) -> Result<ConstraintMgr> { Ok(ConstraintMgr { tok_env, chat_tok_env, backtrack: false, // unlikely ..Default::default() }, - parser_limits: config.limits, - log_stderr_level: config.log_level.unwrap_or(1), + parser_limits: config.limits.clone(), + log_stderr_level: config.log_level, }) } diff --git a/llgtrt/src/jsonutil.rs b/llgtrt/src/jsonutil.rs new file mode 100644 index 0000000..1369cd5 --- /dev/null +++ b/llgtrt/src/jsonutil.rs @@ -0,0 +1,101 @@ +use serde_json::Value; + +pub fn remove_null(v: &mut serde_json::Value) { + if let Some(map) = v.as_object_mut() { + for (_, v) in map.iter_mut() { + remove_null(v); + } + map.retain(|_, v| !v.is_null()); + } + // remove empty arrays + if let Some(arr) = v.as_array_mut() { + arr.iter_mut().for_each(remove_null); + } +} + +pub fn json_merge(a: &mut Value, b: &Value) { + match (a, b) { + (Value::Object(a), Value::Object(b)) => { + for (k, v) in b.iter() { + json_merge(a.entry(k.clone()).or_insert(Value::Null), v); + } + } + (a, b) => *a = b.clone(), + } +} + +fn write_indent(indent: usize, dst: &mut String) { + for _ in 0..indent { + dst.push(' '); + } +} + +fn write_comment(indent: usize, dst: &mut String, comment: Option<&str>) { + if let Some(comment) = comment { + for line in comment.lines() { + write_indent(indent, dst); + dst.push_str("/// "); + dst.push_str(line); + dst.push('\n'); + } + }
+} + +const INDENT_LEVEL: usize = 2; + +fn same_default(v: &Value, default: &Value) -> bool { + match default { + Value::Object(_) | Value::Array(_) => false, + _ => v == default, + } +} + +fn json5_write(indent: usize, dst: &mut String, v: &Value, default: &Value, info: &Value) { + match v { + Value::Object(map) => { + if map.is_empty() { + dst.push_str("{}"); + return; + } + dst.push_str("{"); + for (k, v) in map.iter() { + dst.push_str("\n"); + write_comment(indent + INDENT_LEVEL, dst, info[k]["#"].as_str()); + write_indent(indent + INDENT_LEVEL, dst); + if same_default(v, &default[k]) { + dst.push_str("//"); + } + dst.push_str(&serde_json::to_string_pretty(k).unwrap()); + dst.push_str(": "); + json5_write(indent + INDENT_LEVEL, dst, v, &default[k], &info[k]); + dst.push_str(",\n"); + } + write_indent(indent, dst); + dst.push('}'); + } + Value::Array(arr) => { + if arr.is_empty() { + dst.push_str("[]"); + return; + } + dst.push_str("[\n"); + for v in arr.iter() { + write_indent(indent + INDENT_LEVEL, dst); + json5_write(indent + INDENT_LEVEL, dst, v, &Value::Null, info); + dst.push_str(",\n"); + } + write_indent(indent, dst); + dst.push(']'); + } + Value::String(_) | Value::Number(_) | Value::Bool(_) | Value::Null => { + dst.push_str(&serde_json::to_string_pretty(v).unwrap()); + } + } +} + +pub fn json5_to_string(v: &Value, default: &Value, info: &Value) -> String { + let mut dst = String::new(); + write_comment(0, &mut dst, info["#"].as_str()); + json5_write(0, &mut dst, v, default, info); + dst +} diff --git a/llgtrt/src/lib.rs b/llgtrt/src/lib.rs index f1d67e6..016c126 100644 --- a/llgtrt/src/lib.rs +++ b/llgtrt/src/lib.rs @@ -7,4 +7,5 @@ pub mod startup; pub mod state; mod async_exec; pub mod logging; -mod constraint_mgr; \ No newline at end of file +mod constraint_mgr; +pub mod jsonutil; \ No newline at end of file diff --git a/llgtrt/src/logging.rs b/llgtrt/src/logging.rs index 55b2dd7..f5aa880 100644 --- a/llgtrt/src/logging.rs +++ b/llgtrt/src/logging.rs @@ -82,13 +82,13 @@ pub fn init_log(mode: LogMode) -> Result<()> { let logger = match mode { LogMode::Normal => Logger::try_with_env_or_str("info,tokenizers=error")? .format(truncated_format) - .log_to_stdout(), + .log_to_stderr(), LogMode::Test => { Logger::try_with_env_or_str("debug,tokenizers=error")?.write_mode(WriteMode::SupportCapture) } LogMode::Daemon => Logger::try_with_env_or_str("info,tokenizers=error")? 
.format(daemon_format) - .log_to_stdout(), + .log_to_stderr(), }; logger.start()?; diff --git a/llgtrt/src/main.rs b/llgtrt/src/main.rs index bc5a59c..03bf7f8 100644 --- a/llgtrt/src/main.rs +++ b/llgtrt/src/main.rs @@ -2,12 +2,12 @@ use std::env; use clap::Parser; -use llgtrt::config::Config; +use llgtrt::config::CliConfig; use llgtrt::startup; #[tokio::main] async fn main() -> anyhow::Result<()> { - let config = Config::parse(); + let config = CliConfig::parse(); if config.debug { env::set_var("RUST_LOG", "debug,tokenizers=error"); diff --git a/llgtrt/src/startup.rs b/llgtrt/src/startup.rs index 39e9709..789536f 100644 --- a/llgtrt/src/startup.rs +++ b/llgtrt/src/startup.rs @@ -1,6 +1,6 @@ -use std::fmt::Debug; use std::sync::Arc; +use anyhow::anyhow; use axum::body::Body; use axum::http::{Request, StatusCode}; use axum::middleware::{self, Next}; @@ -10,10 +10,11 @@ use axum::Router; use trtllm_rs::{ClientReqId, ExecutorInit, RequestInit, RequestParams}; use crate::async_exec::AsyncExecutor; -use crate::config::{Config, TrtLlmRuntimeConfig}; +use crate::config::{config_info, CliConfig, LlgTrtConfig}; use crate::constraint_mgr::ConstraintMgr; -use crate::routes; +use crate::jsonutil::json5_to_string; use crate::state::AppState; +use crate::{jsonutil, routes}; async fn auth_middleware( req: Request<Body>, next: Next, @@ -34,66 +35,97 @@ } } -fn load_config_file<T>( - name0: &Option<String>, - default_name: String, - cli_args: serde_json::Value, -) -> anyhow::Result<T> -where - T: serde::de::DeserializeOwned + Debug, -{ - let name = name0.as_ref().unwrap_or(&default_name); - let mut json: serde_json::Value = if name0.is_some() || std::fs::exists(name).unwrap_or(false) { - log::info!("Loading config from {}", name); - let s = std::fs::read_to_string(name)?; - serde_json::from_str(&s)? - } else { - log::info!("Config file {} not found, using defaults", name); - serde_json::json!({}) +pub async fn run_server(mut cli_config: CliConfig) -> anyhow::Result<()> { + let mut exec_config = ExecutorInit { + engine_path: cli_config.engine.clone(), + logits_callback: None, + trt_params: Default::default(), }; - for (k, v) in cli_args.as_object().unwrap() { - if v.is_null() { - continue; + let defl_config_path = format!("{}/llgtrt.json5", cli_config.engine); + if cli_config.config.is_empty() { + if std::fs::exists(&defl_config_path).unwrap_or(false) { + log::info!("Using default config file {}", defl_config_path); + cli_config.config.push(defl_config_path); + } else { + log::info!( + "No config files specified and default config file {} not found", + defl_config_path ); } - json.as_object_mut().unwrap().insert(k.clone(), v.clone()); } - let r = serde_json::from_value(json) - .map_err(|e| anyhow::anyhow!("Error parsing config file {}: {}", name, e))?; - log::info!("Loaded config: {:?}", r); - Ok(r) -} + let mut config = LlgTrtConfig::default(); -pub async fn run_server(cli_config: Config) -> anyhow::Result<()> { - let mut exec_config = ExecutorInit { - engine_path: cli_config.engine.clone(), - logits_callback: None, - trt_params: Default::default(), - }; + if cli_config.save_config.is_some() { + log::info!("Skipping tokenizer config load"); + } else { + let tokenizer_folder = cli_config.tokenizer.as_ref().unwrap_or(&cli_config.engine); + let tokenizer_config = format!("{}/tokenizer_config.json", tokenizer_folder); + log::info!("Loading tokenizer config from {:?}", tokenizer_config); + config.tokenizer = serde_json::from_reader(std::fs::File::open(tokenizer_config)?)
+ .map_err(|e| anyhow!("error loading tokenizer_config.json: {}", e))?; + } - let runtime_config: TrtLlmRuntimeConfig = load_config_file( - &cli_config.runtime_config, - format!("{}/runtime.json", cli_config.engine), - serde_json::to_value(&cli_config.runtime_config_inline)?, - )?; + let mut config = serde_json::to_value(&config)?; + + for file_name in &cli_config.config { + log::info!("Loading JSON5 config from {:?}", file_name); + let file_content = std::fs::read_to_string(&file_name) + .map_err(|e| anyhow!("Error reading config file {}: {}", file_name, e))?; + let patch = json5::from_str::(&file_content) + .map_err(|e| anyhow!("Error in JSON5 in {}: {}", file_name, e))?; + jsonutil::json_merge(&mut config, &patch); + } + + let mut config: LlgTrtConfig = + serde_json::from_value(config).map_err(|e| anyhow!("Error interpreting config: {}", e))?; + + if cli_config.save_config.is_some() { + log::info!("Skipping separate chat template load"); + } else { + let chat_template = cli_config + .chat_template + .clone() + .unwrap_or_else(|| format!("{}/chat_template.j2", cli_config.engine)); + log::info!("Checking for separate chat template in {:?}", chat_template); + if std::fs::exists(&chat_template)? { + config.tokenizer.chat_template = Some(std::fs::read_to_string(chat_template)?); + } + } + if let Some(filename) = cli_config + .save_config + .as_ref() + .or(cli_config.save_complete_config.as_ref()) + { + let r = json5_to_string( + &serde_json::to_value(&config)?, + &serde_json::to_value(&LlgTrtConfig::default())?, + &config_info(), + ); + if filename == "-" { + log::info!("Printing merged config to stdout"); + println!("{}", r); + } else { + log::info!("Saving merged config to {}", filename); + std::fs::write(filename, r)?; + } + return Ok(()); + } + + let runtime_config = &config.runtime; let p = &mut exec_config.trt_params; macro_rules! 
set_field { ($fld:ident) => { - if let Some(v) = runtime_config.$fld { - p.$fld = v - .try_into() - .expect(concat!("Invalid value for ", stringify!($fld))); - } + p.$fld = runtime_config + .$fld + .try_into() + .expect(concat!("Invalid value for ", stringify!($fld))); }; } - // we default these to true - p.enable_chunked_context = true; - p.enable_kv_cache_reuse = true; - set_field!(enable_chunked_context); set_field!(enable_kv_cache_reuse); set_field!(max_batch_size); @@ -101,24 +133,15 @@ set_field!(max_queue_size); set_field!(guaranteed_no_evict); set_field!(kv_cache_free_gpu_mem_fraction); - - if let Some(v) = runtime_config.kv_cache_host_memory_megabytes { - p.kv_cache_host_memory_bytes = v * 1024 * 1024; - } + p.kv_cache_host_memory_bytes = runtime_config.kv_cache_host_memory_megabytes * 1024 * 1024; log::info!("Initializing executor with config: {:?}", exec_config); - let (executor, tok_env, chat_builder) = AsyncExecutor::new(&cli_config, exec_config)?; + let (executor, tok_env, chat_builder) = AsyncExecutor::new(&cli_config, &config, exec_config)?; // we only get here on rank 0 - let llg_config: serde_json::Value = load_config_file( - &cli_config.llguidance_config, - format!("{}/llguidance.json", cli_config.engine), - serde_json::json!({}), - )?; - - let constraint_mgr = ConstraintMgr::new(tok_env.clone(), tok_env.clone(), llg_config)?; + let constraint_mgr = ConstraintMgr::new(tok_env.clone(), tok_env.clone(), &config.llguidance)?; AsyncExecutor::set_global(executor); diff --git a/llgtrt/src/tokenizer.rs b/llgtrt/src/tokenizer.rs index fbe79c3..a8fd72f 100644 --- a/llgtrt/src/tokenizer.rs +++ b/llgtrt/src/tokenizer.rs @@ -1,15 +1,23 @@ -use crate::{chat::ChatBuilder, config::Config}; -use anyhow::{anyhow, ensure}; +use crate::{ + chat::ChatBuilder, + config::{CliConfig, LlgTrtConfig}, +}; +use anyhow::ensure; use serde::{Deserialize, Serialize}; -use serde_json::Value; -use std::{collections::HashMap, sync::Arc}; +use std::sync::Arc; use toktrie::{TokEnv, TokEnvWithTrie}; +const DEFAULT_TEMPLATE: &str = r#"{{- bos_token }} +{%- for message in messages %} + {{- '<|' + message['role'] + '|>\n' }} + {{- message['content'] + eos_token }} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|assistant|>\n' }} +{%- endif %}"#; + #[derive(Debug, Serialize, Deserialize)] pub struct TokenizerConfig { - #[serde(default)] - pub added_tokens_decoder: HashMap<String, TokenProperties>, - pub chat_template: Option<String>, #[serde(default)] @@ -24,50 +32,27 @@ pub mask_token: Option<String>, } -#[derive(Debug, Serialize, Deserialize)] -pub struct TokenProperties { - pub content: String, - #[serde(default)] - pub lstrip: bool, - #[serde(default)] - pub normalized: bool, - #[serde(default)] - pub rstrip: bool, - #[serde(default)] - pub single_word: bool, - #[serde(default)] - pub special: bool, -} - -pub fn setup_tokenizer(cli_config: &Config) -> anyhow::Result<(TokEnv, ChatBuilder)> { - let tokenizer_folder = cli_config.tokenizer.as_ref().unwrap_or(&cli_config.engine); - let tokenizer_config = format!("{}/tokenizer_config.json", tokenizer_folder); - log::info!("Loading tokenizer config from {:?}", tokenizer_config); - let mut tok_cfg: TokenizerConfig = - serde_json::from_reader(std::fs::File::open(tokenizer_config)?)
- .map_err(|e| anyhow!("error loading tokenizer_config.json: {}", e))?; - - let tokenizer_config_llg = format!("{}/tokenizer_config_llgtrt.json", tokenizer_folder); - log::info!("Checking for overrides in {:?}", tokenizer_config_llg); - if std::fs::exists(&tokenizer_config_llg)? { - let mut json = serde_json::to_value(&tok_cfg)?; - let overrides: Value = serde_json::from_reader(std::fs::File::open(tokenizer_config_llg)?) - .map_err(|e| anyhow!("JSON error in tokenizer_config_llgtrt.json: {}", e))?; - for (k, v) in overrides.as_object().expect("overrides must be an object") { - if v.is_null() { - continue; - } - json.as_object_mut().unwrap().insert(k.clone(), v.clone()); +impl Default for TokenizerConfig { + fn default() -> Self { + Self { + chat_template: Some(DEFAULT_TEMPLATE.to_string()), + clean_up_tokenization_spaces: false, + eos_token: "".to_string(), + bos_token: None, + unk_token: None, + sep_token: None, + pad_token: None, + cls_token: None, + mask_token: None, } - tok_cfg = serde_json::from_value(json) - .map_err(|e| anyhow!("error applying tokenizer_config_llgtrt.json: {}", e))?; } +} - let chat_template = format!("{}/chat_template.j2", tokenizer_folder); - log::info!("Checking for separate chat template in {:?}", chat_template); - if std::fs::exists(&chat_template)? { - tok_cfg.chat_template = Some(std::fs::read_to_string(chat_template)?); - } +pub fn setup_tokenizer( + cli_config: &CliConfig, + config: &LlgTrtConfig, +) -> anyhow::Result<(TokEnv, ChatBuilder)> { + let tokenizer_folder = cli_config.tokenizer.as_ref().unwrap_or(&cli_config.engine); let tokenizer = format!("{}/tokenizer.json", tokenizer_folder); log::info!("Loading tokenizer from {:?}", tokenizer); @@ -76,6 +61,7 @@ pub fn setup_tokenizer(cli_config: &Config) -> anyhow::Result<(TokEnv, ChatBuild let trie = tok_env.tok_trie(); let mut info = trie.info().clone(); + let tok_cfg = &config.tokenizer; let toks = tok_env.tokenize_special(&tok_cfg.eos_token); ensure!( toks.len() == 1, diff --git a/llguidance b/llguidance index 771cbf0..4b3645c 160000 --- a/llguidance +++ b/llguidance @@ -1 +1 @@ -Subproject commit 771cbf046f0cd0262ec4e878d584df9383d9466f +Subproject commit 4b3645c71bead19f24a22d4534a06b2bd39aa418 diff --git a/scripts/collect-comments.py b/scripts/collect-comments.py new file mode 100644 index 0000000..06fc793 --- /dev/null +++ b/scripts/collect-comments.py @@ -0,0 +1,99 @@ +import re +import os +import json +import sys + +# Regular expressions for capturing struct definitions and their fields with comments +struct_regex = re.compile(r"pub struct (\w+) \{(.*?)^\s*\}", re.DOTALL | re.MULTILINE) +field_regex = re.compile(r"((?:\s*///\s*.*?\n)+)\s*(pub\s+)?(\w+):\s+([\w:<>]+),", re.DOTALL) + +def extract_structs_from_rust_file(file_content): + structs = {} + + # Iterate over each struct in the file + for struct_match in struct_regex.finditer(file_content): + struct_name = struct_match.group(1) + struct_body = struct_match.group(2) + + fields = {} + + # Iterate over each field in the struct body + for field_match in field_regex.finditer(struct_body): + raw_comment = field_match.group(1).strip() + field_name = field_match.group(3).strip() + field_type = field_match.group(4).strip() + + # Join multiple lines of `///` comments with newline + comment = "\n".join([line.strip()[3:].strip() for line in raw_comment.splitlines()]) + + fields[field_name] = {"#": comment, "type": field_type} + + structs[struct_name] = fields + return structs + +def resolve_struct_recursive(struct_name, structs_metadata): + if 
struct_name not in structs_metadata: + return {} + + resolved_fields = {} + fields = structs_metadata[struct_name] + + for field_name, field_metadata in fields.items(): + comment = field_metadata["#"] + field_type = field_metadata["type"] + + # Check if field type matches another struct, if so, recurse + if field_type in structs_metadata: + resolved_fields[field_name] = { + "#": comment, + **resolve_struct_recursive(field_type, structs_metadata) + } + else: + resolved_fields[field_name] = {"#": comment} + + return resolved_fields + +def process_rust_files(file_list): + structs_metadata = {} + for file_path in file_list: + if os.path.exists(file_path) and file_path.endswith(".rs"): + with open(file_path, 'r', encoding='utf-8') as f: + file_content = f.read() + structs_in_file = extract_structs_from_rust_file(file_content) + structs_metadata.update(structs_in_file) + else: + print(f"Warning: {file_path} not found or not a Rust file.") + return structs_metadata + +def main(): + # Get the list of Rust files from command-line arguments + if len(sys.argv) < 3: + print("Usage: python script.py <root_struct_name> <rust_file1> [<rust_file2> ...]") + sys.exit(1) + + root_struct_name = sys.argv[1] + rust_files = sys.argv[2:] + + # Process the Rust files and extract struct metadata + metadata = process_rust_files(rust_files) + + if root_struct_name not in metadata: + print(f"Error: Struct '{root_struct_name}' not found in the provided files.") + sys.exit(1) + + # Start the recursive resolution from the root struct + resolved_metadata = resolve_struct_recursive(root_struct_name, metadata) + resolved_metadata = { + "##info##": "Use scripts/regen.sh to re-generate this file", + **resolved_metadata + } + + # Output the metadata as JSON + output_file = "llgtrt/src/config_info.json" + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(resolved_metadata, f, indent=1) + + print(f"Metadata written to {output_file}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/regen.sh b/scripts/regen.sh new file mode 100755 index 0000000..531dfb3 --- /dev/null +++ b/scripts/regen.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cd "$(dirname "$0")/.." +python scripts/collect-comments.py LlgTrtConfig llgtrt/src/*.rs llguidance/parser/src/*.rs
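A note on the merge semantics used by `--config`: later files win key-by-key, nested objects merge recursively, and scalars are overwritten wholesale. The standalone sketch below mirrors `jsonutil::json_merge` from this patch; the config contents are made up for illustration:

```rust
// Mirrors jsonutil::json_merge above; the two config snippets are illustrative.
use serde_json::Value;

fn json_merge(a: &mut Value, b: &Value) {
    match (a, b) {
        // objects merge key-by-key, recursing into shared keys
        (Value::Object(a), Value::Object(b)) => {
            for (k, v) in b {
                json_merge(a.entry(k.clone()).or_insert(Value::Null), v);
            }
        }
        // anything else (scalars, arrays) is replaced wholesale
        (a, b) => *a = b.clone(),
    }
}

fn main() {
    // JSON5 allows unquoted keys, comments and trailing commas
    let base: Value =
        json5::from_str("{ runtime: { max_batch_size: 128, enable_chunked_context: true } }")
            .unwrap();
    let over: Value = json5::from_str("{ runtime: { max_batch_size: 64 } }").unwrap();
    let mut cfg = base;
    json_merge(&mut cfg, &over);
    assert_eq!(cfg["runtime"]["max_batch_size"], 64); // overridden by the later file
    assert_eq!(cfg["runtime"]["enable_chunked_context"], true); // preserved from the earlier file
}
```

In particular, passing `-C base.json5 -C override.json5` applies `override.json5` last, so it takes precedence over both `base.json5` and the built-in defaults.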