reworking how to pass config files
mmoskal committed Oct 29, 2024
1 parent bf1c511 commit 9fd8000
Showing 20 changed files with 745 additions and 214 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -1,5 +1,4 @@
build
tmp
.vscode/settings.json
target
model.cache
15 changes: 15 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,15 @@
{
"C_Cpp.autoAddFileAssociations": false,
"files.readonlyInclude": {
"**/config_info.json": true
},
"cSpell.words": [
"ckpt",
"fmha",
"llgtrt",
"mpirun",
"npuichigo",
"openai",
"trtllm"
]
}
128 changes: 128 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

36 changes: 21 additions & 15 deletions README.md
@@ -4,15 +4,15 @@ This project demonstrates how to use
[llguidance library](https://github.com/microsoft/llguidance)
for constrained output with
[NVIDIA TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM),
implementing a server with
[OpenAI REST API](https://platform.openai.com/docs/api-reference/introduction).
implementing a REST server compatible with
[OpenAI APIs](https://platform.openai.com/docs/api-reference/introduction).

The server supports regular completions and chat endpoints
with JSON with schema enforcement ("Structured Output" in OpenAI docs),
with JSON with schema enforcement ("Structured Output"),
as well as full context-free grammars using [Guidance library](https://github.com/guidance-ai/guidance).

This server is similar in spirit to [TensorRT-LLM OpenAI server example](./TensorRT-LLM/examples/apps/openai_server.py),
but python-free and with support for constrained output.
but is Python-free (implemented in Rust) and supports constrained output.
Similarly to the example above, it **does not** use the NVIDIA Triton Inference Server.

## Requirements
@@ -89,17 +89,6 @@ HF Transformers `config.json` as well as the `.safetensors` files and
`tokenizer.json`).
If you're running on more than one GPU, modify the `--tp_size` argument.

### Create config files

By default, llgtrt will use chat template from `tokenizer_config.json`.
If present, it will also read `tokenizer_config_llgtrt.json` from the same directory
and apply any keys from it to `tokenizer_config.json`.
Afterwards, if `chat_template.j2` file is found, it will be used as the chat template.

You can also modify TensorRT-LLM's runtime configuration with the `runtime.json` file
and `llguidance_parser` configuration with `llguidance.json`.
This is optional, see below.

### Running the Engine

```bash
@@ -110,8 +99,25 @@ The command will print out the actual `docker run` invocation on the first line
if you want to invoke it directly later.
`PORT` defaults to 3000.

### Update configuration

You can pass additional arguments after the engine path.
Run `./docker/run.sh /path/to/hf-models/model-engine --help` for more info.
Most of the options are specified in configuration files,
but which configuration files are used can be controlled with command-line arguments.

By default, llgtrt will use the chat template from `tokenizer_config.json`.

If present, it will also read `tokenizer_config_llgtrt.json` from the same directory
and apply any keys from it to `tokenizer_config.json`.
Afterwards, if a `chat_template.j2` file is found, it will be used as the chat template.
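
For example, a `tokenizer_config_llgtrt.json` like the sketch below could swap out the chat template or the end-of-sequence token. The keys shown are only illustrative; any key present simply overrides the key of the same name in `tokenizer_config.json`.

```json
{
  "chat_template": "{{ bos_token }}{% for m in messages %}<|{{ m.role }}|>{{ m.content }}{% endfor %}",
  "eos_token": "<|eot_id|>"
}
```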

You can also modify TensorRT-LLM's runtime configuration with the `runtime.json` file
and the `llguidance_parser` configuration with `llguidance.json`.
This is optional; see below.



The `--help` output has up-to-date info on the `runtime.json` file -
the options can be specified either in these files (replace `-` with `_`)
or on the command line.
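
As an illustration only (the authoritative option names come from `--help`), an option that `--help` lists as, say, `--max-batch-size 128` could instead be set in `runtime.json` as:

```json
{
  "max_batch_size": 128
}
```

The flag name here is hypothetical; the point is just the `-` to `_` mapping between command-line options and the JSON config files.
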
1 change: 1 addition & 0 deletions llgtrt/Cargo.toml
@@ -23,3 +23,4 @@ rayon = "1.10.0"
futures-core = "0.3.30"
minijinja = { version = "2.3.1", features = ["preserve_order", "loop_controls", "loader"] }
chrono = "0.4.38"
json5 = "0.4.1"
92 changes: 92 additions & 0 deletions llgtrt/chat_templates/llama31.j2
@@ -0,0 +1,92 @@
{#- This is adapted from huggingface tokenizer_config.json/chat_template but updated to match #}
{#- https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/prompt_format.md #}
{{- bos_token }}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{%- if not date_string is defined %}
{%- set date_string = "21 September 2024" %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0]['content']|trim %}
{%- set messages = messages[1:] %}
{%- else %}
{%- set system_message = "" %}
{%- endif %}

{#- System message + builtin tools #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{%- if builtin_tools is defined or tools is not none %}
{{- "Environment: ipython\n" }}
{%- endif %}
{%- if builtin_tools is defined %}
{{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
{%- endif %}
{{- "Cutting Knowledge Date: December 2023\n" }}
{{- "Today Date: " + date_string + "\n\n" }}
{{- system_message }}
{{- "<|eot_id|>" }}

{#- Custom tools are passed in a user message with some extra guidance #}
{%- if not tools is none %}
{{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
{{- "Answer the user's question by making use of the following functions if needed.\n" }}
{{- "If none of the function can be used, please say so.\n" }}
{{- "Here is a list of functions in JSON format:\n" }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- "<|eot_id|>"}}
{%- endif %}

{%- for message in messages %}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
{%- elif 'tool_calls' in message %}
{%- if not message.tool_calls|length == 1 %}
{{- raise_exception("This model only supports single tool-calls at once!") }}
{%- endif %}
{%- set tool_call = message.tool_calls[0].function %}
{%- if builtin_tools is defined and tool_call.name in builtin_tools %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
{{- "<|python_tag|>" + tool_call.name + ".call(" }}
{%- for arg_name, arg_val in tool_call.arguments | items %}
{{- arg_name + '="' + arg_val + '"' }}
{%- if not loop.last %}
{{- ", " }}
{%- endif %}
{%- endfor %}
{{- ")" }}
{%- else %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
{{- '<|python_tag|>{"name": "' + tool_call.name + '", ' }}
{{- '"parameters": ' }}
{{- tool_call.arguments | tojson }}
{{- "}" }}
{%- endif %}
{%- if builtin_tools is defined %}
{#- This means we're in ipython mode #}
{{- "<|eom_id|>" }}
{%- else %}
{{- "<|eot_id|>" }}
{%- endif %}
{%- elif message.role == "tool" or message.role == "ipython" %}
{{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
{%- if message.content is mapping or message.content is iterable %}
{{- message.content | tojson }}
{%- else %}
{{- message.content }}
{%- endif %}
{{- "<|eot_id|>" }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
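
For reference, rendering this template with a single user message, no tools, and `add_generation_prompt` set (and assuming the Llama 3.1 `bos_token` of `<|begin_of_text|>`) produces roughly:

```text
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 21 September 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

```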

