From 727d88e9a94e2e51369d1a429583b82c46c20401 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Sat, 26 Oct 2024 04:42:14 -0700
Subject: [PATCH] Update 2024-10-26 04:42:14

---
 README.html                  | 4 ++--
 _sources/troubleshooting.md  | 4 ++--
 backend.html                 | 4 ++--
 benchmark_and_profiling.html | 4 ++--
 choices_methods.html         | 4 ++--
 contributor_guide.html       | 4 ++--
 custom_chat_template.html    | 4 ++--
 embedding_model.html         | 4 ++--
 frontend.html                | 4 ++--
 hyperparameter_tuning.html   | 4 ++--
 index.html                   | 4 ++--
 install.html                 | 4 ++--
 model_support.html           | 4 ++--
 release_process.html         | 4 ++--
 sampling_params.html         | 4 ++--
 searchindex.js               | 2 +-
 send_request.html            | 4 ++--
 setup_github_runner.html     | 4 ++--
 troubleshooting.html         | 8 ++++----
 19 files changed, 39 insertions(+), 39 deletions(-)
diff --git a/README.html b/README.html
index 236f6e5..b966fde 100644
--- a/README.html
+++ b/README.html
@@ -253,7 +253,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/README.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/README.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -270,7 +270,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/README.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/README.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/_sources/troubleshooting.md b/_sources/troubleshooting.md
index c6c016f..02793c9 100644
--- a/_sources/troubleshooting.md
+++ b/_sources/troubleshooting.md
@@ -5,9 +5,9 @@ This page lists some common errors and tips for fixing them.
 ## CUDA error: an illegal memory access was encountered
 This error may be due to kernel errors or out-of-memory issues.
 - If it is a kernel error, it is not easy to fix.
-- If it is out-of-memory, sometimes it will report this error instead of "Out-of-memory." In this case, try setting a smaller value for `--mem-fraction-static`. The default value of `--mem-fraction-static` is around 0.8 - 0.9. https://github.com/sgl-project/sglang/blob/1edd4e07d6ad52f4f63e7f6beaa5987c1e1cf621/python/sglang/srt/server_args.py#L92-L102
+- If it is out-of-memory, sometimes it will report this error instead of "Out-of-memory." In this case, try setting a smaller value for `--mem-fraction-static`. The default value of `--mem-fraction-static` is around 0.8 - 0.9.
 
 ## The server hangs
 If the server hangs, try disabling some optimizations when launching the server.
 - Add `--disable-cuda-graph`.
-- Add `--disable-flashinfer-sampling`.
+- Add `--sampling-backend pytorch`.
diff --git a/backend.html b/backend.html
index d59cc1e..c3632ce 100644
--- a/backend.html
+++ b/backend.html
@@ -253,7 +253,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/backend.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/backend.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -270,7 +270,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/backend.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/backend.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/benchmark_and_profiling.html b/benchmark_and_profiling.html
index 24c1e45..f8d84b9 100644
--- a/benchmark_and_profiling.html
+++ b/benchmark_and_profiling.html
@@ -253,7 +253,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/benchmark_and_profiling.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/benchmark_and_profiling.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -270,7 +270,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/benchmark_and_profiling.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/benchmark_and_profiling.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/choices_methods.html b/choices_methods.html
index cd57bb9..5905994 100644
--- a/choices_methods.html
+++ b/choices_methods.html
@@ -253,7 +253,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/choices_methods.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/choices_methods.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -270,7 +270,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/choices_methods.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/choices_methods.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/contributor_guide.html b/contributor_guide.html
index 7892915..aee83c9 100644
--- a/contributor_guide.html
+++ b/contributor_guide.html
@@ -253,7 +253,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/contributor_guide.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/contributor_guide.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -270,7 +270,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/contributor_guide.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/contributor_guide.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/custom_chat_template.html b/custom_chat_template.html
index b5d7419..874ad15 100644
--- a/custom_chat_template.html
+++ b/custom_chat_template.html
@@ -253,7 +253,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/custom_chat_template.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/custom_chat_template.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -270,7 +270,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/custom_chat_template.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/custom_chat_template.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/embedding_model.html b/embedding_model.html
index e1bfa24..b6c54c8 100644
--- a/embedding_model.html
+++ b/embedding_model.html
@@ -256,7 +256,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/embedding_model.ipynb?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/embedding_model.ipynb?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -273,7 +273,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/embedding_model.ipynb" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/embedding_model.ipynb" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/frontend.html b/frontend.html
index b078287..2f7adf8 100644
--- a/frontend.html
+++ b/frontend.html
@@ -253,7 +253,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/frontend.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/frontend.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -270,7 +270,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/frontend.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/frontend.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/hyperparameter_tuning.html b/hyperparameter_tuning.html
index bdf5f1e..985f048 100644
--- a/hyperparameter_tuning.html
+++ b/hyperparameter_tuning.html
@@ -253,7 +253,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/hyperparameter_tuning.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/hyperparameter_tuning.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -270,7 +270,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/hyperparameter_tuning.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/hyperparameter_tuning.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/index.html b/index.html
index 4faf81b..f853e31 100644
--- a/index.html
+++ b/index.html
@@ -254,7 +254,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/index.rst?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/index.rst?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -271,7 +271,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/index.rst" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/index.rst" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/install.html b/install.html
index 8287ac4..ca33594 100644
--- a/install.html
+++ b/install.html
@@ -253,7 +253,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/install.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/install.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -270,7 +270,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/install.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/install.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/model_support.html b/model_support.html
index d78d686..e2c06a4 100644
--- a/model_support.html
+++ b/model_support.html
@@ -253,7 +253,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/model_support.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -270,7 +270,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/model_support.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/model_support.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/release_process.html b/release_process.html
index 0ce1902..fbda60e 100644
--- a/release_process.html
+++ b/release_process.html
@@ -253,7 +253,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/release_process.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/release_process.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -270,7 +270,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/release_process.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/release_process.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/sampling_params.html b/sampling_params.html
index 8e5a568..bd16c5c 100644
--- a/sampling_params.html
+++ b/sampling_params.html
@@ -253,7 +253,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/sampling_params.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/sampling_params.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -270,7 +270,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/sampling_params.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/sampling_params.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/searchindex.js b/searchindex.js
index 454de5d..9aca4ea 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"alltitles": {"(Minor) Tune --schedule-policy": [[8, "minor-tune-schedule-policy"]], "Achieving Peak Throughput": [[8, "achieving-peak-throughput"]], "Add Unit Tests": [[4, "add-unit-tests"]], "Add a Runner": [[15, "add-a-runner"]], "Add the model to the test suite": [[11, "add-the-model-to-the-test-suite"]], "Additional Server Arguments": [[1, "additional-server-arguments"]], "All Together": [[13, "all-together"]], "Avoid out-of-memory by tuning --chunked-prefill-size, --mem-fraction-static, --max-running-requests": [[8, "avoid-out-of-memory-by-tuning-chunked-prefill-size-mem-fraction-static-max-running-requests"]], "Backend Tutorial": [[9, null]], "Backend: SGLang Runtime (SRT)": [[1, null]], "Baseline": [[13, "baseline"]], "Batching": [[7, "batching"]], "Benchmark": [[2, "benchmark"]], "Benchmark Performance": [[1, "benchmark-performance"]], "Benchmark and Profiling": [[2, null]], "Benchmarks": [[13, "benchmarks"]], "Build": [[0, "build"]], "Build the documentation website": [[0, "build-the-documentation-website"]], "CUDA error: an illegal memory access was encountered": [[16, "cuda-error-an-illegal-memory-access-was-encountered"]], "Choices Methods in SGLang": [[3, null]], "Clean": [[0, "clean"]], "Common Notes": [[10, "common-notes"]], "Constrained Decoding": [[7, "constrained-decoding"]], "Contributor Guide": [[4, null]], "Control Flow": [[7, "control-flow"]], "Custom Chat Template in SGLang Runtime": [[5, null]], "Dependency": [[0, "dependency"]], "Deploy": [[0, "deploy"]], "Embedding Model": [[6, null]], "Engine Without HTTP Server": [[1, "engine-without-http-server"]], "Examples": [[13, "examples"]], "Format Your Code": [[4, "format-your-code"]], "Frequency Penalty": [[13, "frequency-penalty"]], "Frontend Tutorial": [[9, null]], "Frontend: Structured Generation Language (SGLang)": [[7, null]], "Getting Started": [[9, null]], "Greedy Token Selection": [[3, "greedy-token-selection"]], "Guide on Hyperparameter Tuning": [[8, null]], "How to Support a New Model": [[11, null]], "Install SGLang": [[10, null]], "Interactive debugging": [[11, "interactive-debugging"]], "JSON Decoding": [[7, "json-decoding"]], "Language Feature": [[7, "language-feature"]], "Latency": [[13, "latency"]], "Launch A Server": [[6, "Launch-A-Server"]], "Launch a server": [[14, "Launch-a-server"]], "Make a release in GitHub": [[12, "make-a-release-in-github"]], "Memory": [[13, "memory"]], "Method 1: With pip": [[10, "method-1-with-pip"]], "Method 2: From source": [[10, "method-2-from-source"]], "Method 3: Using docker": [[10, "method-3-using-docker"]], "Method 4: Using docker compose": [[10, "method-4-using-docker-compose"]], "Method 5: Run on Kubernetes or Clouds with SkyPilot": [[10, "method-5-run-on-kubernetes-or-clouds-with-skypilot"]], "Methods": [[3, "methods"]], "Min New Tokens": [[13, "min-new-tokens"]], "More Examples": [[7, "more-examples"]], "Multi modal": [[13, "multi-modal"]], "Multi-Modality": [[7, "multi-modality"]], "Normal": [[13, "normal"]], "OpenAI Compatible API": [[1, "openai-compatible-api"]], "Other tips": [[2, "other-tips"]], "Parallelism": [[7, "parallelism"]], "Performance Implications on Penalties": [[13, "performance-implications-on-penalties"]], "Port a model from vLLM to SGLang": [[11, "port-a-model-from-vllm-to-sglang"]], "Presence Penalty": [[13, "presence-penalty"]], "Profile with Nsight": [[2, "profile-with-nsight"]], "PyPI Package Release Process": [[12, null]], "Quick Start": [[1, "quick-start"], [7, "quick-start"], [14, null]], "References": [[9, null]], "Repetition Penalty": [[13, "repetition-penalty"]], "Roles": [[7, "roles"]], "Run Llama 3.1 405B": [[1, "run-llama-3-1-405b"]], "SGLang Documentation": [[0, null], [9, null]], "Sampling Parameters in SGLang Runtime": [[13, null]], "Send a Request": [[14, "Send-a-Request"]], "Serve (preview)": [[0, "serve-preview"]], "Set Up Self-hosted Runners for GitHub Action": [[15, null]], "Step 1: Start a docker container.": [[15, "step-1-start-a-docker-container"]], "Step 2: Configure the runner by config.sh": [[15, "step-2-configure-the-runner-by-config-sh"]], "Step 3: Run the runner by run.sh": [[15, "step-3-run-the-runner-by-run-sh"]], "Streaming": [[7, "streaming"], [13, "streaming"]], "Supported Models": [[1, "supported-models"]], "Test the correctness": [[11, "test-the-correctness"]], "The server hangs": [[16, "the-server-hangs"]], "Tips and Implementation Details": [[7, "tips-and-implementation-details"]], "Token Length Normalized": [[3, "token-length-normalized"]], "Troubleshooting": [[16, null]], "Try advanced options": [[8, "try-advanced-options"]], "Tune --dp-size and --tp-size": [[8, "tune-dp-size-and-tp-size"]], "Tune --schedule-conservativeness": [[8, "tune-schedule-conservativeness"]], "Tune Your Request Submission Speed": [[8, "tune-your-request-submission-speed"]], "Unconditional Likelihood Normalized": [[3, "unconditional-likelihood-normalized"]], "Update the version in code": [[12, "update-the-version-in-code"]], "Upload the PyPI package": [[12, "upload-the-pypi-package"]], "Use Curl": [[6, "Use-Curl"]], "Use Models From ModelScope": [[1, "use-models-from-modelscope"]], "Using Local Models": [[7, "using-local-models"]], "Using OpenAI Compatible API": [[6, "Using-OpenAI-Compatible-API"], [14, "Using-OpenAI-Compatible-API"]], "Using OpenAI Models": [[7, "using-openai-models"]]}, "docnames": ["README", "backend", "benchmark_and_profiling", "choices_methods", "contributor_guide", "custom_chat_template", "embedding_model", "frontend", "hyperparameter_tuning", "index", "install", "model_support", "release_process", "sampling_params", "send_request", "setup_github_runner", "troubleshooting"], "envversion": {"nbsphinx": 4, "sphinx": 63, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["README.md", "backend.md", "benchmark_and_profiling.md", "choices_methods.md", "contributor_guide.md", "custom_chat_template.md", "embedding_model.ipynb", "frontend.md", "hyperparameter_tuning.md", "index.rst", "install.md", "model_support.md", "release_process.md", "sampling_params.md", "send_request.ipynb", "setup_github_runner.md", "troubleshooting.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [3, 6, 7, 8, 10, 11, 13, 14], "0": [1, 6, 7, 8, 10, 13, 14, 15, 16], "0000": 8, "0006804466247558594": 6, "0006995201110839844": 6, "0013713836669921875": 6, "00209808349609375": 6, "003047943115234375": 6, "00603485107421875": 6, "0062103271484375": 6, "0066680908203125": 6, "00809478759765625": 6, "0083160400390625": 6, "0089874267578125": 6, "0090179443359375": 6, "01": [7, 8, 13], "01131439208984375": 6, "01238250732421875": 6, "01273345947265625": 6, "0135955810546875": 6, "0143890380859375": 6, "01552581787109375": 6, "0190582275390625": 6, "02": 13, "021759033203125": 6, "03": 13, "04": [13, 15], "05": 13, "06": 13, "08": 13, "0_rocm6": 15, "0_triton3": 15, "1": [2, 6, 7, 8, 11, 13, 14], "10": [1, 2, 6, 13], "100": 7, "101": 13, "103": 13, "104": 13, "10405": 13, "10666": 13, "107": 13, "10767": 13, "11": 13, "114": 13, "11586": 13, "117": 13, "11732": 13, "12": [10, 13, 15], "127": [1, 6, 14], "128": [1, 13], "128009": 14, "13": 13, "14226": 13, "1449c9c20d4448299431a57facc68d7a": 14, "16": [1, 7, 13], "16219": 13, "16740": 13, "16757c3dd6e14a6e9bafd1122f84e4c5": 14, "17": 13, "17125": 13, "17167": 13, "172": 1, "1729816891": 14, "1729816893": 14, "174": 13, "179": 13, "18": 13, "18895": 13, "189": 13, "191": 13, "195": 13, "19884": 13, "1edd4e07d6ad52f4f63e7f6beaa5987c1e1cf621": 16, "1st": 13, "2": [1, 5, 6, 7, 9, 13, 14], "200": [6, 14], "20000": 1, "2048": [2, 8], "205": 13, "20866": 13, "22095": 13, "22363": 13, "22603": 13, "233": [8, 13], "23892": 13, "24": 13, "25": 7, "256": [1, 2, 7, 13], "26": 13, "268": 13, "27": 13, "271": 13, "29": 13, "293": 13, "3": [2, 5, 6, 7, 8, 9, 13, 14], "30": 13, "3000": 13, "30000": [1, 5, 7, 10, 13, 14], "30010": 6, "308": 13, "31": 13, "317": 8, "32": [1, 2, 13], "320": 13, "34": 14, "35": 13, "36": 13, "37": 13, "370959": 8, "378633": 13, "379": 14, "38": 13, "39": [13, 14], "4": [1, 7, 14], "40": 13, "40881": 13, "409": 13, "4096": [1, 2, 8], "41": 13, "41888": 13, "426": 14, "433": 13, "43967": 13, "44": 13, "440": 13, "447": 13, "44926": 13, "45": 13, "453": 13, "45354": 13, "45445": 13, "455": 13, "4594": 8, "46": [13, 14], "46530": 13, "47": [13, 14], "47738": 13, "48302": 13, "4832": 13, "48960": 13, "49": 14, "49017": 13, "49263": 13, "5": [1, 7, 13, 14], "50": [8, 13], "500": 8, "50000": 1, "50302": 13, "5079": 13, "51": 13, "512": [2, 13], "52": 1, "5206": 13, "5255": 13, "52554": 13, "52825": 13, "52920": 13, "54": 13, "54497": 13, "55": 13, "56": 13, "5656": 13, "5727": 13, "57426": 13, "58": 13, "59": 13, "5b": 11, "6": [1, 15], "60": [2, 13], "6000": 2, "61": 13, "64": [1, 2, 13, 14], "64g": 15, "65": 13, "66": 13, "67": 13, "68": 13, "69": 13, "7": 1, "70": [2, 13], "71": 13, "72": 13, "72b": 1, "73": 13, "74": 13, "75": 13, "76": 13, "766008": 13, "774756": 13, "774955": 13, "775118": 13, "775210": 13, "775220": 13, "775651": 13, "78": 13, "79": 13, "7b": [1, 5, 6, 13], "7fa2af80": 2, "8": [1, 13, 16], "8000": 0, "81": 13, "82": 8, "83": 13, "84": 13, "8413": 13, "85": 13, "86": 13, "88": 13, "89": 13, "8b": [1, 2, 7, 10, 13, 14], "9": [1, 7, 8, 16], "90": 13, "91": 13, "93": 13, "94": 13, "95": [1, 13, 14], "96": 13, "97": 13, "98": 13, "9900": 13, "9998": 8, "A": [1, 2, 7, 8, 10], "By": [5, 13], "For": [1, 2, 3, 11, 13], "If": [1, 5, 8, 10, 13, 16], "In": [1, 7, 16], "It": [1, 3, 5, 7, 8, 9, 10, 13, 14], "NOT": 5, "On": 8, "The": [1, 2, 3, 7, 8, 9, 10, 11, 13, 15], "Then": [7, 15], "There": 5, "These": 13, "To": [0, 1, 2, 7, 8, 10, 11], "__init__": 12, "__main__": 1, "__name__": 1, "_build": 0, "a10": 10, "a100": 10, "abl": 11, "about": [1, 5, 7, 8], "abov": [2, 3, 10, 13], "acceler": [1, 8, 10], "accept": 13, "access": [0, 1, 10], "accord": [2, 7, 10], "accur": [1, 2], "across": 3, "activ": 9, "ad": 10, "add": [1, 2, 7, 8, 13, 16], "addit": [3, 7, 13], "addr": 1, "address": [1, 7], "adopt": 9, "adv": 2, "advanc": 9, "against": 3, "ai": [1, 10, 14], "alexa": 14, "algorithm": 13, "alibaba": [1, 6], "aliv": 7, "all": [0, 1, 3, 4, 7, 8, 10, 11, 15], "all_other_model": 11, "allow": [2, 10], "almost": [1, 8, 11], "also": [1, 5, 7, 8, 13], "altern": [3, 7], "alwai": [8, 14], "am": 14, "amd": 15, "amount": 14, "an": [0, 1, 3, 7, 9, 10, 13, 14, 15], "analyz": 14, "ani": [1, 7, 10, 13], "annot": 2, "anoth": [11, 14], "answer": [3, 7, 14], "answer_1": 7, "answer_2": 7, "anthrop": 7, "antidisestablishmentarian": 3, "api": [3, 5, 7, 10, 13], "api_kei": [1, 6, 14], "appear": 13, "appli": 13, "applic": [1, 6, 9, 14], "approach": 10, "apt": [2, 15], "ar": [1, 2, 3, 5, 6, 7, 8, 10, 11, 13, 14], "arch": 15, "architectur": 2, "arg": 3, "argument": [2, 7, 13], "around": 16, "articl": 14, "artifici": 14, "assign": 14, "assist": [1, 3, 5, 7, 13, 14], "assistant_begin": 7, "assistant_end": 7, "attain": 8, "attent": [9, 10, 11], "attract": [3, 7], "auror": 7, "australia": 14, "author": [6, 14], "automat": 13, "autoregress": 7, "autosc": 10, "avail": [1, 10], "averag": 3, "avoid": 10, "awq": 9, "b": 10, "back": 9, "backend": [2, 3, 10, 13], "bad": 3, "baichuan2": 1, "balanc": 7, "base": [3, 13], "base64": 13, "base_url": [1, 6, 14], "bash": [12, 15], "basic": 13, "batch": [1, 2, 8, 9, 13], "bearer": [6, 14], "becaus": [7, 8], "been": 14, "befor": [2, 13], "begin": 7, "being": [8, 14], "below": [7, 10, 13, 15], "bench_lat": [1, 2, 11], "bench_serv": [1, 2, 13], "benchmark": 9, "berlin": 3, "bespok": 3, "better": [1, 8, 10, 11], "between": [1, 13], "bia": 7, "bin": 15, "blob": [13, 16], "block": 7, "blogpost": 3, "blood": 7, "bodi": [7, 13], "bogart": 7, "bool": 13, "born": 7, "both": 8, "bottleneck": 8, "branch": 10, "bras\u00edlia": 14, "brazil": 14, "break": [6, 13, 14], "browser": 0, "build": [1, 10, 12], "built": 10, "c": [6, 10], "cach": [1, 2, 8, 9, 10, 15], "calcul": 7, "call": [3, 7, 9, 14], "can": [1, 2, 3, 4, 5, 7, 8, 10, 11, 13, 14, 15], "canberra": 14, "cannot": 13, "capabl": 14, "capit": [1, 3, 7, 13, 14], "case": [8, 16], "cd": [4, 10, 12], "chain": 9, "chang": [11, 15], "charact": 7, "character_gen": 7, "character_regex": 7, "characterist": 14, "chat": [1, 7, 13, 14], "chat_exampl": 7, "chat_templ": 5, "chatbot": 14, "chatcomplet": 14, "chatcompletionmessag": 14, "chatglm": 1, "chatml": [1, 5, 13], "check": [1, 10], "checkpoint": [1, 2], "choic": [7, 9, 14], "choices_method": 3, "chunk": [1, 9, 13], "ci": 4, "class": 13, "cli": 2, "client": [1, 2, 6, 14], "clone": [0, 10], "cluster": 10, "co": 9, "code": [2, 7, 14], "collect": 14, "color": 2, "com": [2, 10, 12, 13, 15, 16], "come": [8, 13], "command": [1, 2, 4, 10, 11, 14, 15], "commit": 4, "common": 16, "commun": 9, "compar": 11, "comparison": [3, 11], "compat": [5, 7, 13], "compil": [1, 8], "complet": [1, 7, 14], "completion_token": 14, "completionusag": 14, "complex": 7, "comprehend": 14, "comput": [2, 7, 8, 13], "conda": 10, "confid": 3, "config": [1, 2], "connect": [7, 10], "consid": [2, 13], "constrain": [8, 9, 13], "constraint": 7, "contain": 3, "content": [1, 6, 7, 14], "context": 14, "contextu": 14, "continu": [7, 9], "contribut": 5, "contributor": 9, "control": 9, "convers": [5, 14], "convert": 11, "copi": 10, "core": [7, 9], "correct": [2, 13], "could": 13, "countri": [1, 14], "coverag": 11, "cpu": 8, "creat": [1, 6, 11, 14], "critic": 2, "cu121": 10, "cuda": [1, 2, 10, 13, 14, 15], "cuda_visible_devic": 15, "curl": [1, 13, 14, 15], "currenli": [1, 8], "custom": 1, "d": [0, 1, 2, 6, 7, 10, 14], "data": [1, 6, 8, 13, 14], "dataclass": 13, "dataset": [2, 13, 14], "dbrx": 1, "dc9d06d886151707f97d0b78095df9de262fd3c9": 13, "deactiv": 10, "deadlock": 1, "death": 7, "deb": 2, "deceas": 7, "decod": [8, 9, 13], "decode_unicod": 13, "decor": 7, "decreas": 8, "deep": 14, "deepseek": [1, 9], "def": [1, 3, 7, 14], "default": [1, 3, 5, 8, 10, 13, 16], "defin": [5, 7], "delai": 2, "depend": 10, "deploi": 10, "deploy": 10, "describ": [3, 13], "descript": [2, 13, 14], "design": [9, 14], "desir": 13, "detail": 13, "detailed_tip": 7, "determin": 3, "detoken": 13, "dev": [1, 15], "devel": 15, "develop": 2, "devic": [1, 10, 15], "devnul": [6, 14], "devtool": 2, "dict": 13, "diet": 7, "differ": 11, "difficult": 13, "digest": 14, "directli": 1, "directori": 11, "disabl": [1, 2, 13, 16], "dislik": 13, "distrib_releas": 2, "dn": 7, "do": [2, 8, 13, 15], "doc": [2, 3, 10, 13], "doc_site_path": 0, "dockerfil": 10, "document": [5, 10], "doe": [1, 2, 8], "donald": 3, "done": [13, 15], "down": 3, "download": [2, 13], "dp": 1, "dpkg": 2, "drawback": 13, "dri": 15, "dtype": 1, "duck": 3, "due": [3, 8, 16], "dummi": 2, "durat": [2, 13], "dure": [1, 8, 13, 14], "dynam": [1, 2], "e": [2, 10, 11, 14, 15], "e2": 13, "e5": [1, 9], "each": 1, "earli": 8, "earlier": 3, "easi": [9, 11, 16], "easier": 7, "eater": 7, "echo": [2, 15], "edit": 15, "effici": [1, 9], "either": 13, "element": 6, "eleutherai": 3, "elif": 7, "els": 14, "embed": [1, 9], "embedding_process": 6, "empti": 1, "empty_cach": 14, "enabl": [1, 7, 8, 10, 14], "encod": 13, "encount": 10, "encourag": 13, "end": [7, 11, 13], "endpoint": [1, 10, 13], "engag": 14, "engin": 7, "enough": [1, 8], "entryclass": 11, "enumer": 7, "env": 10, "environ": [1, 15], "eo": [8, 13], "equival": [6, 14], "error": [1, 6, 8, 14], "etc": [2, 9], "eth0": 1, "even": [3, 14], "everi": 13, "exampl": [1, 3, 6, 11, 14, 15], "example_imag": 13, "exaon": 1, "except": [6, 14], "excl": 13, "exec": 2, "execut": [10, 14], "exercis": 7, "exist": 11, "expand": 7, "experiment": 8, "explicit": 14, "export": [0, 1, 7, 15], "express": [7, 13], "extend": 3, "extens": [9, 11], "extern": [7, 9], "extra": 13, "f": [1, 7, 14], "face": [1, 5], "fail": 3, "failur": 10, "fals": 13, "far": 13, "fast": 9, "faster": 9, "favor": 8, "fcf": 8, "featur": [1, 9], "feed": 14, "fetch": 2, "file": [0, 2, 4, 11, 13], "fill": 7, "fillmor": 3, "final": 14, "find": [7, 11, 13], "finish_reason": 14, "first": [1, 2, 6, 7, 8, 13], "fix": 16, "flashinf": [9, 10, 16], "flexibl": 9, "float": 13, "flow": 9, "flush": [7, 13], "folder": [2, 4, 15], "follow": [1, 2, 5, 7, 8, 11, 13, 15], "forev": 15, "fork": [2, 7], "format": [2, 7, 13], "forward": [9, 11], "forward_batch": 11, "found": 7, "fp16": 1, "fp8": [1, 8, 9], "fp8_e5m2": 1, "fraction": [1, 13, 16], "framework": 9, "franc": [1, 3, 7, 13], "frequency_penalti": 13, "frequent": 8, "from": [4, 5, 7, 14], "frontend": [5, 10], "full": [1, 8], "function": [3, 7, 11], "function_cal": 14, "further": 10, "futur": [1, 11], "g": [2, 10, 11, 15], "gc": 14, "gemini": 7, "gemma": [1, 9], "gen": [3, 7, 8], "gener": [0, 1, 9, 13, 14], "generatereqinput": 13, "get": [6, 10, 11, 13, 14], "getpgid": 14, "git": [10, 15], "github": [0, 10, 13, 16], "give": [11, 15], "given": 13, "glimps": 13, "gloo_socket_ifnam": 1, "gnupg": 2, "good": 8, "googl": [7, 14], "gpt": 7, "gptq": 9, "gpu": [1, 8, 10, 13, 15], "graph": [1, 2, 16], "greedy_token_select": 3, "grok": 1, "group": 15, "gryffindor": 7, "gte": [1, 6], "guid": [9, 10, 13], "h": [1, 6, 14], "h100": [10, 13], "ha": [11, 14], "haisgl": 15, "half": 7, "hand": 8, "handl": [1, 2, 13], "happen": 8, "hardwar": 13, "harri": 7, "have": [0, 1, 3, 8, 13, 14], "header": [6, 14], "healthi": [7, 8], "hello": 1, "help": [1, 7, 8, 11, 13, 14], "henryx": 15, "here": [1, 7, 14], "hf": 5, "hf_home": 15, "hf_token": [10, 15], "hf_xxx": 15, "high": [3, 8, 13], "higher": 13, "highest": [3, 7], "hit": 13, "host": [6, 10, 14], "hostnam": 1, "hous": 7, "how": [1, 3, 4, 6, 7, 9], "html": [0, 2], "http": [0, 2, 6, 7, 10, 12, 13, 14, 15, 16], "hub": 10, "hufflepuff": 7, "hug": [1, 5], "huggingfac": [10, 11, 15], "human": 14, "hyperparamet": [1, 9], "i": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16], "id": [13, 14], "ident": 11, "ignor": 13, "ignore_eo": 13, "im_end": [5, 13], "im_start": [5, 13], "imag": [7, 10, 13], "image_data": 13, "image_fil": 7, "image_id": 10, "image_qa": 7, "implement": [3, 11], "import": [1, 2, 6, 7, 8, 13, 14], "improv": 14, "includ": [7, 9, 14], "incorrect": 3, "increas": 8, "incur": 3, "independ": 10, "index": [2, 14], "indic": 8, "industri": 9, "inf": 13, "infer": [1, 13], "inform": [7, 13], "infra": 10, "init": 1, "initi": [3, 14], "input": [1, 2, 6, 7, 9, 13, 14], "input_id": 13, "insid": 15, "instal": [0, 2, 4, 9, 12, 15], "installationguid": 2, "instanc": 3, "instead": [1, 16], "instruct": [1, 2, 6, 7, 10, 13, 14], "int": 13, "int4": 9, "int4wo": 1, "integr": 9, "intellig": 14, "intent": 14, "inter": 13, "interact": 9, "interfac": [9, 11], "internlm": 1, "intuit": 9, "invok": 7, "involv": 14, "io": 0, "ip": [1, 7], "ipc": 10, "ipc_collect": 14, "is_avail": 14, "issu": [7, 10, 16], "iter_lin": 13, "itl": 13, "its": [3, 14], "japan": [7, 14], "json": [1, 2, 5, 6, 13, 14], "json_decod": 7, "json_output": 7, "json_schema": 13, "jump": 9, "just": 5, "k": 13, "k8": 10, "kei": [2, 7, 14], "kernel": [9, 10, 16], "kfd": 15, "kill": 14, "killpg": 14, "kingdom": 7, "kv": [1, 8], "l": 13, "l102": 16, "l4": 10, "l40": 10, "l92": 16, "lab": [1, 13], "label": [14, 15], "lang": 13, "languag": [5, 9, 10, 14], "larg": [1, 2, 8, 9, 14], "last": 10, "later": [3, 15], "latest": 10, "launch": [1, 2, 5, 7, 10, 13, 16], "launch_serv": [1, 2, 5, 6, 7, 10, 13, 14], "layer": [11, 14], "layer_id": 11, "learn": [1, 4, 11, 14], "least": 13, "len": [1, 2, 13], "length": [7, 13], "let": 1, "level": [6, 13, 14], "librari": 7, "like": [8, 14], "limit": 3, "line": 14, "lint": 4, "linux": 15, "list": [1, 2, 7, 11, 13, 14, 16], "llama": [2, 5, 7, 9, 10, 11, 13, 14], "llama3": 1, "llava": [1, 9, 13], "llava_llama_3": 1, "llm": [1, 3, 9, 14], "lmm": [1, 13], "lmsysorg": 10, "load": [1, 2, 6, 8, 13], "load_imag": 13, "local": 10, "local_example_llava_next": 7, "localhost": [0, 1, 6, 7, 13, 14], "locat": 13, "log": [6, 7, 8, 14], "logic": 13, "logit": [7, 11, 13], "logitsprocessor": 11, "logprob": [3, 13, 14], "logprob_start_len": 13, "london": 3, "long": [1, 14], "longer": 3, "longest": 8, "look": [5, 8], "loop": 7, "low": 13, "lpm": 8, "lsb": 2, "m": [0, 1, 2, 5, 6, 7, 10, 11, 13, 14], "machin": 10, "magic": 7, "mai": [1, 2, 7, 16], "main": [1, 13], "maintain": 11, "major": 11, "make": [0, 8, 9, 11, 13], "manag": 7, "mani": [3, 8, 11], "manner": 13, "mask": 7, "massiv": 14, "match": 8, "matched_stop": 14, "math": 7, "max": 13, "max_new_token": [1, 8, 13], "max_token": [1, 7, 14], "maximum": 13, "md": 4, "mean": [8, 13], "meanwhil": 5, "measur": 13, "media": 14, "median": 13, "meet": 1, "mem": [1, 13, 16], "memori": [1, 2], "messag": [1, 7, 14], "meta": [1, 2, 5, 7, 10, 13, 14], "method": 9, "millard": 3, "min_new_token": 13, "min_p": 13, "minicpm": 1, "ministri": 7, "mislead": 3, "miss": 5, "mistral": [1, 9], "mix": 13, "mixtral": 1, "modal": [1, 9], "model": [2, 3, 5, 8, 9, 10, 13, 14, 15], "model_path": 1, "moe": 1, "more": [1, 9, 10, 13, 14], "most": [5, 8, 11], "mount": 15, "muggl": 7, "mulit": 7, "multi": [1, 9], "multi_turn_quest": 7, "multipl": [1, 14], "multipli": 13, "must": 13, "my": 1, "my_model": 5, "my_model_templ": 5, "n": [7, 13, 14], "n1": 14, "n2": 14, "n3": 14, "n4": 14, "n5": 14, "name": [1, 2, 3, 5, 7, 13, 14, 15], "natur": 14, "nccl": 1, "ndescrib": 13, "need": [2, 5, 7, 10, 11, 15], "nemo": 1, "nest": 7, "network": 14, "neural": 14, "new": [1, 8, 9, 12, 14, 15], "new_token_ratio": 8, "next": [1, 6, 14], "ngener": 1, "nlarg": 14, "nllm": 14, "nlp": [1, 6], "nnode": 1, "node": [1, 2], "non": 7, "none": [6, 13, 14], "normal": 7, "note": [1, 2, 5, 11, 13, 15], "now": 7, "nsome": 14, "nsy": 2, "nt": 14, "nthe": 14, "nuanc": 14, "null": [10, 14], "num": [1, 2, 13], "number": [8, 13], "nvidia": [2, 13, 15], "nvtx": 2, "nyou": 13, "o": [2, 13, 14, 15], "object": 14, "obtain": 3, "occasion": 8, "occup": 7, "offer": 9, "offici": 5, "offlin": 1, "okai": 8, "olmo": 1, "omit": 3, "onc": [1, 3, 6, 14], "one": [3, 7, 13, 14], "onevis": [1, 13], "onli": [2, 3, 7, 10, 11, 13], "onlin": [1, 2], "only_run": 11, "oom": [8, 13], "open": [9, 10], "openai": [3, 5, 10, 13], "openai_api_kei": [7, 15], "oper": 10, "optim": 16, "option": [3, 13], "order": 7, "other": [3, 8, 10, 11, 13], "out": [1, 2, 7, 10, 16], "output": [1, 2, 11, 13], "ov": [1, 13], "overhead": [8, 13], "overlap": [3, 8], "overrid": 5, "own": [1, 10, 13], "p": [10, 13], "p2p": 1, "p99": 13, "page": [9, 16], "paragraph": 7, "parallel": [1, 8, 9, 13], "paramet": [8, 9], "pari": 3, "part": 11, "pass": [4, 7, 11, 14], "path": [0, 1, 2, 3, 5, 6, 7, 10, 13, 14], "patronu": 7, "pattern": 14, "peer": 1, "penal": 13, "per": 13, "perform": 3, "pgid": 14, "phoenix": 7, "pid": 14, "piec": 14, "pip": [0, 2, 12, 15], "pip3": 4, "plan": 10, "playground": 11, "pleas": [1, 7, 10], "png": 13, "poll": 14, "pool": [1, 8], "poorli": 3, "popen": [6, 14], "port": [1, 5, 6, 7, 10, 13, 14], "post": [13, 14], "post2": 10, "post3_vllm0": 15, "potter": 7, "power": 14, "pre": 4, "predict": 3, "prefil": [1, 2, 9, 11], "prefix": [8, 9], "prerequisit": 2, "presence_penalti": 13, "presid": [1, 3], "prev": 13, "primit": [3, 7], "print": [1, 2, 6, 7, 13, 14], "probabl": 7, "proceed": [6, 14], "process": [13, 14], "processlookuperror": 14, "product": 14, "profil": 9, "program": [9, 10], "progress_bar": 7, "project": [0, 5, 10, 12, 13, 15, 16], "prompt": [1, 2, 7, 9, 13], "prompt_token": 14, "prompt_tokens_detail": 14, "proper": 10, "provid": [1, 2, 7, 9, 10], "pub": 2, "pull": 15, "pure": 7, "py": [0, 1, 2, 5, 7, 11, 12, 13, 16], "pydant": 7, "pyproject": 12, "python": [1, 2, 5, 6, 7, 10, 12, 13, 14, 16], "python3": [0, 1, 2, 6, 10, 11, 13, 15], "pytorch": 10, "q": 7, "quantiz": [1, 9], "queri": 1, "question": [7, 14], "question_1": 7, "question_2": 7, "queue": 8, "quick": 2, "quick_start": 7, "qwen": [1, 9, 11], "qwen2": [1, 6, 11, 13], "r": [0, 1, 7], "radix": 2, "radixattent": [9, 11], "ran": 13, "random": [2, 13], "rang": [8, 9, 14], "rank": 1, "rate": 13, "ravenclaw": 7, "raw": 13, "reach": 13, "readi": [6, 14], "readm": 4, "readme_exampl": 7, "real": [1, 2], "recommend": [2, 10, 13, 14], "recoveri": 10, "reduc": [1, 8], "refer": [1, 11], "reference_hf": 11, "refus": 14, "regex": [7, 13], "regist": 5, "regular": [7, 13], "regular_expression_gen": 7, "relat": [5, 10], "relationship": 14, "releas": [2, 10], "relev": 13, "remot": 10, "remov": [0, 11], "repeat": 13, "repetition_penalti": 13, "replac": [1, 10, 11], "repo": 2, "report": [1, 16], "req": [8, 13], "request": [1, 6, 7, 13], "requestexcept": [6, 14], "requir": 0, "resourc": [10, 11], "respons": [1, 3, 6, 13, 14], "restart": 15, "result": [3, 13], "retoken": 13, "retracted_req": 8, "return": 13, "return_logprob": 13, "return_text_in_logprob": 13, "reus": 11, "rid": 13, "rm": 15, "rmsnorm": 11, "role": [1, 14], "root": 10, "run": [0, 2, 4, 6, 7, 11, 13, 14], "run_batch": 7, "runner_allow_runasroot": 15, "running_request": 13, "runtim": [9, 10], "runtimeendpoint": [3, 7], "same": [1, 2, 7, 11, 13], "sampl": [9, 10, 11, 16], "sampling_param": [1, 13], "scale": [10, 13], "schema": [7, 13], "script": 11, "search": 7, "secret": 10, "section": 13, "see": [1, 7, 8, 10, 13], "select": [7, 10], "self": 14, "semant": 14, "send": [1, 8, 13], "sentenc": 13, "sep": 5, "sep_styl": 5, "serv": [1, 2, 8, 9, 10, 13], "server": [0, 2, 5, 7, 8, 13], "server_arg": 16, "server_process": 14, "servic": 10, "service_ti": 14, "set": [1, 2, 5, 7, 10, 13, 14, 16], "set_default_backend": 7, "sever": [1, 2], "sgl": [0, 1, 3, 7, 10, 12, 13, 15, 16], "sgl0": 15, "sglang": [2, 4, 6, 12, 14, 15, 16], "sglang_is_in_ci": 15, "sglang_use_modelscop": 1, "sh": 12, "share": [8, 15], "shell": 6, "shm": 15, "short": 13, "shorter": [3, 14], "should": [5, 11], "show": 7, "sigkil": 14, "signal": 14, "sigterm": 14, "siluandmul": 11, "similar": [11, 13], "simpl": 7, "simpli": 3, "sinc": 13, "singl": [1, 2, 10, 11, 13], "siri": 14, "size": [1, 2, 15], "sk": [7, 15], "skip": 13, "skip_special_token": 13, "sky": 10, "skyserv": 10, "sleep": [6, 14, 15], "slytherin": 7, "sm75": 10, "small": [1, 8], "smaller": [1, 16], "smollm": 1, "snippet": 2, "so": [1, 2, 13], "social": 14, "some": [2, 7, 11, 13, 15, 16], "sometim": 16, "sourc": [2, 9], "space": 13, "spaces_between_special_token": 13, "special": 13, "specif": [1, 10, 11], "specifi": [1, 3, 5, 7, 13, 14, 15], "srt": [9, 10, 13, 16], "stablelm": 1, "stai": 7, "stand": [8, 14], "start": [11, 13], "startswith": 13, "state": [1, 7], "static": [1, 2, 13, 16], "statu": [7, 10], "status_cod": [6, 14], "stderr": [6, 14], "stdin": 6, "stdout": [6, 14], "step": [6, 14], "still": 13, "stop": [7, 8, 13, 14], "stop_str": 5, "stop_token_id": 13, "store": 13, "stori": 14, "str": 13, "strategi": 1, "stream": 1, "string": [8, 13], "strip": 13, "strong": 3, "strongli": 14, "structur": 9, "student": 7, "subprocess": [6, 14], "subset": 3, "success": 13, "suggest": 8, "summar": 14, "summari": 7, "supervis": 14, "suppli": [3, 13], "support": [3, 7, 9, 10, 13, 14], "sure": [0, 11, 13], "switch": 10, "sxm5": 13, "sy": 6, "syntax": 14, "system": [1, 2, 5, 7, 13, 14], "system_fingerprint": 14, "t4": 10, "take": 8, "teacher": 7, "techniqu": 14, "tee": 2, "temperatur": [1, 7, 13, 14], "templat": [1, 7, 13], "temporarili": 5, "tensor": [1, 9], "termin": [10, 14], "terminate_process": 14, "test": [1, 2, 13, 14, 15], "test_generation_model": 11, "test_oth": 11, "test_vision_openai_serv": 1, "testgenerationmodel": 11, "text": [1, 6, 11, 13, 14], "text_it": 7, "text_qa": 7, "thei": 13, "them": [10, 14, 16], "thi": [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16], "thing": 8, "through": [7, 13], "throughput": [1, 13], "time": [1, 2, 6, 13, 14], "timeout": 14, "timeoutexpir": 14, "tip": 16, "tip_suggest": 7, "tmp": 15, "todai": [1, 6], "togeth": [1, 8], "tok": 13, "token": [1, 5, 7, 8, 9, 10], "token_id": 13, "token_length_norm": 3, "tokyo": 14, "toml": 12, "tone": 14, "too": 8, "tool": 7, "tool_cal": 14, "tool_us": 7, "top": 13, "top_k": 13, "top_logprobs_num": 13, "top_p": [1, 13], "topic": 14, "torch": [1, 8, 14], "torch2": 10, "torchao": 1, "total": [1, 13], "total_token": 14, "tp": 1, "tpot": 13, "tr": 2, "trace": 2, "traffic": 13, "train": [2, 14], "transform": 11, "translat": 14, "triton": 10, "troubleshoot": 9, "true": [1, 2, 6, 7, 13, 14, 15], "truncat": [1, 2], "try": [1, 6, 13, 14, 16], "ttft": 13, "tune": [1, 9, 13], "turbo": 7, "turn": 7, "twine": 12, "two": [1, 5, 7, 11], "txt": 0, "type": [1, 6, 14], "typic": 14, "u": 3, "ubuntu": 2, "ubuntu1804": 2, "ubuntu22": 15, "unconditional_likelihood_norm": 3, "under": [2, 4, 11], "understand": [11, 14], "union": 13, "unit": [1, 2, 7], "unittest": 11, "until": 13, "up": 10, "updat": [0, 2, 15], "upgrad": 10, "upload_pypi": 12, "upon": [1, 6], "url": 13, "us": [2, 3, 4, 5, 8, 13, 15], "us_president_exampl": 3, "usag": [1, 3, 8, 14], "user": [1, 3, 5, 7, 8, 13, 14], "usual": 13, "utf": 13, "util": [8, 13], "v": [10, 15], "v0": 10, "v1": [1, 6, 14], "valu": [1, 8, 13, 16], "valuabl": 11, "variabl": [1, 15], "variant": 2, "variou": 1, "vast": 14, "veri": [8, 11, 13], "version": [10, 14], "vertexai": 7, "video": 15, "view": 1, "virtual": 14, "vision": [1, 9], "visit": 0, "vl": 1, "vocab_s": 13, "w": 7, "wai": 11, "wait": 14, "wand": 7, "want": [1, 13], "warn": [8, 14], "we": [1, 13], "weight": [1, 2, 15], "welcom": 5, "well": 11, "were": 13, "what": [3, 7, 14], "when": [5, 7, 8, 13, 16], "where": [3, 14], "whether": 13, "which": [8, 13, 14], "while": [1, 2, 6, 10, 13, 14, 15], "whl": 10, "wide": [9, 14], "within": 7, "without": [2, 10, 14], "wood": 7, "word": 7, "work": [1, 5, 8, 15], "workflow": 7, "workload": 8, "write": 0, "x64": 15, "x86_64": 2, "xvers": 1, "xxx": 15, "y": [2, 15], "yaml": 10, "yi": 1, "yml": 10, "you": [0, 1, 2, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15], "your": [0, 1, 5, 7, 9, 10, 13, 14], "zip": 1}, "titles": ["SGLang Documentation", "Backend: SGLang Runtime (SRT)", "Benchmark and Profiling", "Choices Methods in SGLang", "Contributor Guide", "Custom Chat Template in SGLang Runtime", "Embedding Model", "Frontend: Structured Generation Language (SGLang)", "Guide on Hyperparameter Tuning", "SGLang Documentation", "Install SGLang", "How to Support a New Model", "PyPI Package Release Process", "Sampling Parameters in SGLang Runtime", "Quick Start", "Set Up Self-hosted Runners for GitHub Action", "Troubleshooting"], "titleterms": {"1": [1, 10, 15], "2": [10, 15], "3": [1, 10, 15], "4": 10, "405b": 1, "5": 10, "A": 6, "The": 16, "With": 10, "access": 16, "achiev": 8, "action": 15, "add": [4, 11, 15], "addit": 1, "advanc": 8, "all": 13, "an": 16, "api": [1, 6, 14], "argument": 1, "avoid": 8, "backend": [1, 9], "baselin": 13, "batch": 7, "benchmark": [1, 2, 13], "build": 0, "chat": 5, "choic": 3, "chunk": 8, "clean": 0, "cloud": 10, "code": [4, 12], "common": 10, "compat": [1, 6, 14], "compos": 10, "config": 15, "configur": 15, "conserv": 8, "constrain": 7, "contain": 15, "contributor": 4, "control": 7, "correct": 11, "cuda": 16, "curl": 6, "custom": 5, "debug": 11, "decod": 7, "depend": 0, "deploi": 0, "detail": 7, "docker": [10, 15], "document": [0, 9], "dp": 8, "embed": 6, "encount": 16, "engin": 1, "error": 16, "exampl": [7, 13], "featur": 7, "flow": 7, "format": 4, "fraction": 8, "frequenc": 13, "from": [1, 10, 11], "frontend": [7, 9], "gener": 7, "get": 9, "github": [12, 15], "greedi": 3, "guid": [4, 8], "hang": 16, "host": 15, "how": 11, "http": 1, "hyperparamet": 8, "illeg": 16, "implement": 7, "implic": 13, "instal": 10, "interact": 11, "json": 7, "kubernet": 10, "languag": 7, "latenc": 13, "launch": [6, 14], "length": 3, "likelihood": 3, "llama": 1, "local": 7, "make": 12, "max": 8, "mem": 8, "memori": [8, 13, 16], "method": [3, 10], "min": 13, "minor": 8, "modal": [7, 13], "model": [1, 6, 7, 11], "modelscop": 1, "more": 7, "multi": [7, 13], "new": [11, 13], "normal": [3, 13], "note": 10, "nsight": 2, "openai": [1, 6, 7, 14], "option": 8, "other": 2, "out": 8, "packag": 12, "parallel": 7, "paramet": 13, "peak": 8, "penalti": 13, "perform": [1, 13], "pip": 10, "polici": 8, "port": 11, "prefil": 8, "presenc": 13, "preview": 0, "process": 12, "profil": 2, "pypi": 12, "quick": [1, 7, 14], "refer": 9, "releas": 12, "repetit": 13, "request": [8, 14], "role": 7, "run": [1, 8, 10, 15], "runner": 15, "runtim": [1, 5, 13], "sampl": 13, "schedul": 8, "select": 3, "self": 15, "send": 14, "serv": 0, "server": [1, 6, 14, 16], "set": 15, "sglang": [0, 1, 3, 5, 7, 9, 10, 11, 13], "sh": 15, "size": 8, "skypilot": 10, "sourc": 10, "speed": 8, "srt": 1, "start": [1, 7, 9, 14, 15], "static": 8, "step": 15, "stream": [7, 13], "structur": 7, "submiss": 8, "suit": 11, "support": [1, 11], "templat": 5, "test": [4, 11], "throughput": 8, "tip": [2, 7], "togeth": 13, "token": [3, 13], "tp": 8, "troubleshoot": 16, "try": 8, "tune": 8, "tutori": 9, "uncondit": 3, "unit": 4, "up": 15, "updat": 12, "upload": 12, "us": [1, 6, 7, 10, 14], "version": 12, "vllm": 11, "wa": 16, "websit": 0, "without": 1, "your": [4, 8]}})
\ No newline at end of file
+Search.setIndex({"alltitles": {"(Minor) Tune --schedule-policy": [[8, "minor-tune-schedule-policy"]], "Achieving Peak Throughput": [[8, "achieving-peak-throughput"]], "Add Unit Tests": [[4, "add-unit-tests"]], "Add a Runner": [[15, "add-a-runner"]], "Add the model to the test suite": [[11, "add-the-model-to-the-test-suite"]], "Additional Server Arguments": [[1, "additional-server-arguments"]], "All Together": [[13, "all-together"]], "Avoid out-of-memory by tuning --chunked-prefill-size, --mem-fraction-static, --max-running-requests": [[8, "avoid-out-of-memory-by-tuning-chunked-prefill-size-mem-fraction-static-max-running-requests"]], "Backend Tutorial": [[9, null]], "Backend: SGLang Runtime (SRT)": [[1, null]], "Baseline": [[13, "baseline"]], "Batching": [[7, "batching"]], "Benchmark": [[2, "benchmark"]], "Benchmark Performance": [[1, "benchmark-performance"]], "Benchmark and Profiling": [[2, null]], "Benchmarks": [[13, "benchmarks"]], "Build": [[0, "build"]], "Build the documentation website": [[0, "build-the-documentation-website"]], "CUDA error: an illegal memory access was encountered": [[16, "cuda-error-an-illegal-memory-access-was-encountered"]], "Choices Methods in SGLang": [[3, null]], "Clean": [[0, "clean"]], "Common Notes": [[10, "common-notes"]], "Constrained Decoding": [[7, "constrained-decoding"]], "Contributor Guide": [[4, null]], "Control Flow": [[7, "control-flow"]], "Custom Chat Template in SGLang Runtime": [[5, null]], "Dependency": [[0, "dependency"]], "Deploy": [[0, "deploy"]], "Embedding Model": [[6, null]], "Engine Without HTTP Server": [[1, "engine-without-http-server"]], "Examples": [[13, "examples"]], "Format Your Code": [[4, "format-your-code"]], "Frequency Penalty": [[13, "frequency-penalty"]], "Frontend Tutorial": [[9, null]], "Frontend: Structured Generation Language (SGLang)": [[7, null]], "Getting Started": [[9, null]], "Greedy Token Selection": [[3, "greedy-token-selection"]], "Guide on Hyperparameter Tuning": [[8, null]], "How to Support a New Model": [[11, null]], "Install SGLang": [[10, null]], "Interactive debugging": [[11, "interactive-debugging"]], "JSON Decoding": [[7, "json-decoding"]], "Language Feature": [[7, "language-feature"]], "Latency": [[13, "latency"]], "Launch A Server": [[6, "Launch-A-Server"]], "Launch a server": [[14, "Launch-a-server"]], "Make a release in GitHub": [[12, "make-a-release-in-github"]], "Memory": [[13, "memory"]], "Method 1: With pip": [[10, "method-1-with-pip"]], "Method 2: From source": [[10, "method-2-from-source"]], "Method 3: Using docker": [[10, "method-3-using-docker"]], "Method 4: Using docker compose": [[10, "method-4-using-docker-compose"]], "Method 5: Run on Kubernetes or Clouds with SkyPilot": [[10, "method-5-run-on-kubernetes-or-clouds-with-skypilot"]], "Methods": [[3, "methods"]], "Min New Tokens": [[13, "min-new-tokens"]], "More Examples": [[7, "more-examples"]], "Multi modal": [[13, "multi-modal"]], "Multi-Modality": [[7, "multi-modality"]], "Normal": [[13, "normal"]], "OpenAI Compatible API": [[1, "openai-compatible-api"]], "Other tips": [[2, "other-tips"]], "Parallelism": [[7, "parallelism"]], "Performance Implications on Penalties": [[13, "performance-implications-on-penalties"]], "Port a model from vLLM to SGLang": [[11, "port-a-model-from-vllm-to-sglang"]], "Presence Penalty": [[13, "presence-penalty"]], "Profile with Nsight": [[2, "profile-with-nsight"]], "PyPI Package Release Process": [[12, null]], "Quick Start": [[1, "quick-start"], [7, "quick-start"], [14, null]], "References": [[9, null]], "Repetition Penalty": [[13, "repetition-penalty"]], "Roles": [[7, "roles"]], "Run Llama 3.1 405B": [[1, "run-llama-3-1-405b"]], "SGLang Documentation": [[0, null], [9, null]], "Sampling Parameters in SGLang Runtime": [[13, null]], "Send a Request": [[14, "Send-a-Request"]], "Serve (preview)": [[0, "serve-preview"]], "Set Up Self-hosted Runners for GitHub Action": [[15, null]], "Step 1: Start a docker container.": [[15, "step-1-start-a-docker-container"]], "Step 2: Configure the runner by config.sh": [[15, "step-2-configure-the-runner-by-config-sh"]], "Step 3: Run the runner by run.sh": [[15, "step-3-run-the-runner-by-run-sh"]], "Streaming": [[7, "streaming"], [13, "streaming"]], "Supported Models": [[1, "supported-models"]], "Test the correctness": [[11, "test-the-correctness"]], "The server hangs": [[16, "the-server-hangs"]], "Tips and Implementation Details": [[7, "tips-and-implementation-details"]], "Token Length Normalized": [[3, "token-length-normalized"]], "Troubleshooting": [[16, null]], "Try advanced options": [[8, "try-advanced-options"]], "Tune --dp-size and --tp-size": [[8, "tune-dp-size-and-tp-size"]], "Tune --schedule-conservativeness": [[8, "tune-schedule-conservativeness"]], "Tune Your Request Submission Speed": [[8, "tune-your-request-submission-speed"]], "Unconditional Likelihood Normalized": [[3, "unconditional-likelihood-normalized"]], "Update the version in code": [[12, "update-the-version-in-code"]], "Upload the PyPI package": [[12, "upload-the-pypi-package"]], "Use Curl": [[6, "Use-Curl"]], "Use Models From ModelScope": [[1, "use-models-from-modelscope"]], "Using Local Models": [[7, "using-local-models"]], "Using OpenAI Compatible API": [[6, "Using-OpenAI-Compatible-API"], [14, "Using-OpenAI-Compatible-API"]], "Using OpenAI Models": [[7, "using-openai-models"]]}, "docnames": ["README", "backend", "benchmark_and_profiling", "choices_methods", "contributor_guide", "custom_chat_template", "embedding_model", "frontend", "hyperparameter_tuning", "index", "install", "model_support", "release_process", "sampling_params", "send_request", "setup_github_runner", "troubleshooting"], "envversion": {"nbsphinx": 4, "sphinx": 63, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["README.md", "backend.md", "benchmark_and_profiling.md", "choices_methods.md", "contributor_guide.md", "custom_chat_template.md", "embedding_model.ipynb", "frontend.md", "hyperparameter_tuning.md", "index.rst", "install.md", "model_support.md", "release_process.md", "sampling_params.md", "send_request.ipynb", "setup_github_runner.md", "troubleshooting.md"], "indexentries": {}, "objects": {}, "objnames": {}, "objtypes": {}, "terms": {"": [3, 6, 7, 8, 10, 11, 13, 14], "0": [1, 6, 7, 8, 10, 13, 14, 15, 16], "0000": 8, "0006804466247558594": 6, "0006995201110839844": 6, "0013713836669921875": 6, "00209808349609375": 6, "003047943115234375": 6, "00603485107421875": 6, "0062103271484375": 6, "0066680908203125": 6, "00809478759765625": 6, "0083160400390625": 6, "0089874267578125": 6, "0090179443359375": 6, "01": [7, 8, 13], "01131439208984375": 6, "01238250732421875": 6, "01273345947265625": 6, "0135955810546875": 6, "0143890380859375": 6, "01552581787109375": 6, "0190582275390625": 6, "02": 13, "021759033203125": 6, "03": 13, "04": [13, 15], "05": 13, "06": 13, "08": 13, "0_rocm6": 15, "0_triton3": 15, "1": [2, 6, 7, 8, 11, 13, 14], "10": [1, 2, 6, 13], "100": 7, "101": 13, "103": 13, "104": 13, "10405": 13, "10666": 13, "107": 13, "10767": 13, "11": 13, "114": 13, "11586": 13, "117": 13, "11732": 13, "12": [10, 13, 15], "127": [1, 6, 14], "128": [1, 13], "128009": 14, "13": 13, "14226": 13, "1449c9c20d4448299431a57facc68d7a": 14, "16": [1, 7, 13], "16219": 13, "16740": 13, "16757c3dd6e14a6e9bafd1122f84e4c5": 14, "17": 13, "17125": 13, "17167": 13, "172": 1, "1729816891": 14, "1729816893": 14, "174": 13, "179": 13, "18": 13, "18895": 13, "189": 13, "191": 13, "195": 13, "19884": 13, "1st": 13, "2": [1, 5, 6, 7, 9, 13, 14], "200": [6, 14], "20000": 1, "2048": [2, 8], "205": 13, "20866": 13, "22095": 13, "22363": 13, "22603": 13, "233": [8, 13], "23892": 13, "24": 13, "25": 7, "256": [1, 2, 7, 13], "26": 13, "268": 13, "27": 13, "271": 13, "29": 13, "293": 13, "3": [2, 5, 6, 7, 8, 9, 13, 14], "30": 13, "3000": 13, "30000": [1, 5, 7, 10, 13, 14], "30010": 6, "308": 13, "31": 13, "317": 8, "32": [1, 2, 13], "320": 13, "34": 14, "35": 13, "36": 13, "37": 13, "370959": 8, "378633": 13, "379": 14, "38": 13, "39": [13, 14], "4": [1, 7, 14], "40": 13, "40881": 13, "409": 13, "4096": [1, 2, 8], "41": 13, "41888": 13, "426": 14, "433": 13, "43967": 13, "44": 13, "440": 13, "447": 13, "44926": 13, "45": 13, "453": 13, "45354": 13, "45445": 13, "455": 13, "4594": 8, "46": [13, 14], "46530": 13, "47": [13, 14], "47738": 13, "48302": 13, "4832": 13, "48960": 13, "49": 14, "49017": 13, "49263": 13, "5": [1, 7, 13, 14], "50": [8, 13], "500": 8, "50000": 1, "50302": 13, "5079": 13, "51": 13, "512": [2, 13], "52": 1, "5206": 13, "5255": 13, "52554": 13, "52825": 13, "52920": 13, "54": 13, "54497": 13, "55": 13, "56": 13, "5656": 13, "5727": 13, "57426": 13, "58": 13, "59": 13, "5b": 11, "6": [1, 15], "60": [2, 13], "6000": 2, "61": 13, "64": [1, 2, 13, 14], "64g": 15, "65": 13, "66": 13, "67": 13, "68": 13, "69": 13, "7": 1, "70": [2, 13], "71": 13, "72": 13, "72b": 1, "73": 13, "74": 13, "75": 13, "76": 13, "766008": 13, "774756": 13, "774955": 13, "775118": 13, "775210": 13, "775220": 13, "775651": 13, "78": 13, "79": 13, "7b": [1, 5, 6, 13], "7fa2af80": 2, "8": [1, 13, 16], "8000": 0, "81": 13, "82": 8, "83": 13, "84": 13, "8413": 13, "85": 13, "86": 13, "88": 13, "89": 13, "8b": [1, 2, 7, 10, 13, 14], "9": [1, 7, 8, 16], "90": 13, "91": 13, "93": 13, "94": 13, "95": [1, 13, 14], "96": 13, "97": 13, "98": 13, "9900": 13, "9998": 8, "A": [1, 2, 7, 8, 10], "By": [5, 13], "For": [1, 2, 3, 11, 13], "If": [1, 5, 8, 10, 13, 16], "In": [1, 7, 16], "It": [1, 3, 5, 7, 8, 9, 10, 13, 14], "NOT": 5, "On": 8, "The": [1, 2, 3, 7, 8, 9, 10, 11, 13, 15], "Then": [7, 15], "There": 5, "These": 13, "To": [0, 1, 2, 7, 8, 10, 11], "__init__": 12, "__main__": 1, "__name__": 1, "_build": 0, "a10": 10, "a100": 10, "abl": 11, "about": [1, 5, 7, 8], "abov": [2, 3, 10, 13], "acceler": [1, 8, 10], "accept": 13, "access": [0, 1, 10], "accord": [2, 7, 10], "accur": [1, 2], "across": 3, "activ": 9, "ad": 10, "add": [1, 2, 7, 8, 13, 16], "addit": [3, 7, 13], "addr": 1, "address": [1, 7], "adopt": 9, "adv": 2, "advanc": 9, "against": 3, "ai": [1, 10, 14], "alexa": 14, "algorithm": 13, "alibaba": [1, 6], "aliv": 7, "all": [0, 1, 3, 4, 7, 8, 10, 11, 15], "all_other_model": 11, "allow": [2, 10], "almost": [1, 8, 11], "also": [1, 5, 7, 8, 13], "altern": [3, 7], "alwai": [8, 14], "am": 14, "amd": 15, "amount": 14, "an": [0, 1, 3, 7, 9, 10, 13, 14, 15], "analyz": 14, "ani": [1, 7, 10, 13], "annot": 2, "anoth": [11, 14], "answer": [3, 7, 14], "answer_1": 7, "answer_2": 7, "anthrop": 7, "antidisestablishmentarian": 3, "api": [3, 5, 7, 10, 13], "api_kei": [1, 6, 14], "appear": 13, "appli": 13, "applic": [1, 6, 9, 14], "approach": 10, "apt": [2, 15], "ar": [1, 2, 3, 5, 6, 7, 8, 10, 11, 13, 14], "arch": 15, "architectur": 2, "arg": 3, "argument": [2, 7, 13], "around": 16, "articl": 14, "artifici": 14, "assign": 14, "assist": [1, 3, 5, 7, 13, 14], "assistant_begin": 7, "assistant_end": 7, "attain": 8, "attent": [9, 10, 11], "attract": [3, 7], "auror": 7, "australia": 14, "author": [6, 14], "automat": 13, "autoregress": 7, "autosc": 10, "avail": [1, 10], "averag": 3, "avoid": 10, "awq": 9, "b": 10, "back": 9, "backend": [2, 3, 10, 13, 16], "bad": 3, "baichuan2": 1, "balanc": 7, "base": [3, 13], "base64": 13, "base_url": [1, 6, 14], "bash": [12, 15], "basic": 13, "batch": [1, 2, 8, 9, 13], "bearer": [6, 14], "becaus": [7, 8], "been": 14, "befor": [2, 13], "begin": 7, "being": [8, 14], "below": [7, 10, 13, 15], "bench_lat": [1, 2, 11], "bench_serv": [1, 2, 13], "benchmark": 9, "berlin": 3, "bespok": 3, "better": [1, 8, 10, 11], "between": [1, 13], "bia": 7, "bin": 15, "blob": 13, "block": 7, "blogpost": 3, "blood": 7, "bodi": [7, 13], "bogart": 7, "bool": 13, "born": 7, "both": 8, "bottleneck": 8, "branch": 10, "bras\u00edlia": 14, "brazil": 14, "break": [6, 13, 14], "browser": 0, "build": [1, 10, 12], "built": 10, "c": [6, 10], "cach": [1, 2, 8, 9, 10, 15], "calcul": 7, "call": [3, 7, 9, 14], "can": [1, 2, 3, 4, 5, 7, 8, 10, 11, 13, 14, 15], "canberra": 14, "cannot": 13, "capabl": 14, "capit": [1, 3, 7, 13, 14], "case": [8, 16], "cd": [4, 10, 12], "chain": 9, "chang": [11, 15], "charact": 7, "character_gen": 7, "character_regex": 7, "characterist": 14, "chat": [1, 7, 13, 14], "chat_exampl": 7, "chat_templ": 5, "chatbot": 14, "chatcomplet": 14, "chatcompletionmessag": 14, "chatglm": 1, "chatml": [1, 5, 13], "check": [1, 10], "checkpoint": [1, 2], "choic": [7, 9, 14], "choices_method": 3, "chunk": [1, 9, 13], "ci": 4, "class": 13, "cli": 2, "client": [1, 2, 6, 14], "clone": [0, 10], "cluster": 10, "co": 9, "code": [2, 7, 14], "collect": 14, "color": 2, "com": [2, 10, 12, 13, 15], "come": [8, 13], "command": [1, 2, 4, 10, 11, 14, 15], "commit": 4, "common": 16, "commun": 9, "compar": 11, "comparison": [3, 11], "compat": [5, 7, 13], "compil": [1, 8], "complet": [1, 7, 14], "completion_token": 14, "completionusag": 14, "complex": 7, "comprehend": 14, "comput": [2, 7, 8, 13], "conda": 10, "confid": 3, "config": [1, 2], "connect": [7, 10], "consid": [2, 13], "constrain": [8, 9, 13], "constraint": 7, "contain": 3, "content": [1, 6, 7, 14], "context": 14, "contextu": 14, "continu": [7, 9], "contribut": 5, "contributor": 9, "control": 9, "convers": [5, 14], "convert": 11, "copi": 10, "core": [7, 9], "correct": [2, 13], "could": 13, "countri": [1, 14], "coverag": 11, "cpu": 8, "creat": [1, 6, 11, 14], "critic": 2, "cu121": 10, "cuda": [1, 2, 10, 13, 14, 15], "cuda_visible_devic": 15, "curl": [1, 13, 14, 15], "currenli": [1, 8], "custom": 1, "d": [0, 1, 2, 6, 7, 10, 14], "data": [1, 6, 8, 13, 14], "dataclass": 13, "dataset": [2, 13, 14], "dbrx": 1, "dc9d06d886151707f97d0b78095df9de262fd3c9": 13, "deactiv": 10, "deadlock": 1, "death": 7, "deb": 2, "deceas": 7, "decod": [8, 9, 13], "decode_unicod": 13, "decor": 7, "decreas": 8, "deep": 14, "deepseek": [1, 9], "def": [1, 3, 7, 14], "default": [1, 3, 5, 8, 10, 13, 16], "defin": [5, 7], "delai": 2, "depend": 10, "deploi": 10, "deploy": 10, "describ": [3, 13], "descript": [2, 13, 14], "design": [9, 14], "desir": 13, "detail": 13, "detailed_tip": 7, "determin": 3, "detoken": 13, "dev": [1, 15], "devel": 15, "develop": 2, "devic": [1, 10, 15], "devnul": [6, 14], "devtool": 2, "dict": 13, "diet": 7, "differ": 11, "difficult": 13, "digest": 14, "directli": 1, "directori": 11, "disabl": [1, 2, 13, 16], "dislik": 13, "distrib_releas": 2, "dn": 7, "do": [2, 8, 13, 15], "doc": [2, 3, 10, 13], "doc_site_path": 0, "dockerfil": 10, "document": [5, 10], "doe": [1, 2, 8], "donald": 3, "done": [13, 15], "down": 3, "download": [2, 13], "dp": 1, "dpkg": 2, "drawback": 13, "dri": 15, "dtype": 1, "duck": 3, "due": [3, 8, 16], "dummi": 2, "durat": [2, 13], "dure": [1, 8, 13, 14], "dynam": [1, 2], "e": [2, 10, 11, 14, 15], "e2": 13, "e5": [1, 9], "each": 1, "earli": 8, "earlier": 3, "easi": [9, 11, 16], "easier": 7, "eater": 7, "echo": [2, 15], "edit": 15, "effici": [1, 9], "either": 13, "element": 6, "eleutherai": 3, "elif": 7, "els": 14, "embed": [1, 9], "embedding_process": 6, "empti": 1, "empty_cach": 14, "enabl": [1, 7, 8, 10, 14], "encod": 13, "encount": 10, "encourag": 13, "end": [7, 11, 13], "endpoint": [1, 10, 13], "engag": 14, "engin": 7, "enough": [1, 8], "entryclass": 11, "enumer": 7, "env": 10, "environ": [1, 15], "eo": [8, 13], "equival": [6, 14], "error": [1, 6, 8, 14], "etc": [2, 9], "eth0": 1, "even": [3, 14], "everi": 13, "exampl": [1, 3, 6, 11, 14, 15], "example_imag": 13, "exaon": 1, "except": [6, 14], "excl": 13, "exec": 2, "execut": [10, 14], "exercis": 7, "exist": 11, "expand": 7, "experiment": 8, "explicit": 14, "export": [0, 1, 7, 15], "express": [7, 13], "extend": 3, "extens": [9, 11], "extern": [7, 9], "extra": 13, "f": [1, 7, 14], "face": [1, 5], "fail": 3, "failur": 10, "fals": 13, "far": 13, "fast": 9, "faster": 9, "favor": 8, "fcf": 8, "featur": [1, 9], "feed": 14, "fetch": 2, "file": [0, 2, 4, 11, 13], "fill": 7, "fillmor": 3, "final": 14, "find": [7, 11, 13], "finish_reason": 14, "first": [1, 2, 6, 7, 8, 13], "fix": 16, "flashinf": [9, 10], "flexibl": 9, "float": 13, "flow": 9, "flush": [7, 13], "folder": [2, 4, 15], "follow": [1, 2, 5, 7, 8, 11, 13, 15], "forev": 15, "fork": [2, 7], "format": [2, 7, 13], "forward": [9, 11], "forward_batch": 11, "found": 7, "fp16": 1, "fp8": [1, 8, 9], "fp8_e5m2": 1, "fraction": [1, 13, 16], "framework": 9, "franc": [1, 3, 7, 13], "frequency_penalti": 13, "frequent": 8, "from": [4, 5, 7, 14], "frontend": [5, 10], "full": [1, 8], "function": [3, 7, 11], "function_cal": 14, "further": 10, "futur": [1, 11], "g": [2, 10, 11, 15], "gc": 14, "gemini": 7, "gemma": [1, 9], "gen": [3, 7, 8], "gener": [0, 1, 9, 13, 14], "generatereqinput": 13, "get": [6, 10, 11, 13, 14], "getpgid": 14, "git": [10, 15], "github": [0, 10, 13], "give": [11, 15], "given": 13, "glimps": 13, "gloo_socket_ifnam": 1, "gnupg": 2, "good": 8, "googl": [7, 14], "gpt": 7, "gptq": 9, "gpu": [1, 8, 10, 13, 15], "graph": [1, 2, 16], "greedy_token_select": 3, "grok": 1, "group": 15, "gryffindor": 7, "gte": [1, 6], "guid": [9, 10, 13], "h": [1, 6, 14], "h100": [10, 13], "ha": [11, 14], "haisgl": 15, "half": 7, "hand": 8, "handl": [1, 2, 13], "happen": 8, "hardwar": 13, "harri": 7, "have": [0, 1, 3, 8, 13, 14], "header": [6, 14], "healthi": [7, 8], "hello": 1, "help": [1, 7, 8, 11, 13, 14], "henryx": 15, "here": [1, 7, 14], "hf": 5, "hf_home": 15, "hf_token": [10, 15], "hf_xxx": 15, "high": [3, 8, 13], "higher": 13, "highest": [3, 7], "hit": 13, "host": [6, 10, 14], "hostnam": 1, "hous": 7, "how": [1, 3, 4, 6, 7, 9], "html": [0, 2], "http": [0, 2, 6, 7, 10, 12, 13, 14, 15], "hub": 10, "hufflepuff": 7, "hug": [1, 5], "huggingfac": [10, 11, 15], "human": 14, "hyperparamet": [1, 9], "i": [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 16], "id": [13, 14], "ident": 11, "ignor": 13, "ignore_eo": 13, "im_end": [5, 13], "im_start": [5, 13], "imag": [7, 10, 13], "image_data": 13, "image_fil": 7, "image_id": 10, "image_qa": 7, "implement": [3, 11], "import": [1, 2, 6, 7, 8, 13, 14], "improv": 14, "includ": [7, 9, 14], "incorrect": 3, "increas": 8, "incur": 3, "independ": 10, "index": [2, 14], "indic": 8, "industri": 9, "inf": 13, "infer": [1, 13], "inform": [7, 13], "infra": 10, "init": 1, "initi": [3, 14], "input": [1, 2, 6, 7, 9, 13, 14], "input_id": 13, "insid": 15, "instal": [0, 2, 4, 9, 12, 15], "installationguid": 2, "instanc": 3, "instead": [1, 16], "instruct": [1, 2, 6, 7, 10, 13, 14], "int": 13, "int4": 9, "int4wo": 1, "integr": 9, "intellig": 14, "intent": 14, "inter": 13, "interact": 9, "interfac": [9, 11], "internlm": 1, "intuit": 9, "invok": 7, "involv": 14, "io": 0, "ip": [1, 7], "ipc": 10, "ipc_collect": 14, "is_avail": 14, "issu": [7, 10, 16], "iter_lin": 13, "itl": 13, "its": [3, 14], "japan": [7, 14], "json": [1, 2, 5, 6, 13, 14], "json_decod": 7, "json_output": 7, "json_schema": 13, "jump": 9, "just": 5, "k": 13, "k8": 10, "kei": [2, 7, 14], "kernel": [9, 10, 16], "kfd": 15, "kill": 14, "killpg": 14, "kingdom": 7, "kv": [1, 8], "l": 13, "l4": 10, "l40": 10, "lab": [1, 13], "label": [14, 15], "lang": 13, "languag": [5, 9, 10, 14], "larg": [1, 2, 8, 9, 14], "last": 10, "later": [3, 15], "latest": 10, "launch": [1, 2, 5, 7, 10, 13, 16], "launch_serv": [1, 2, 5, 6, 7, 10, 13, 14], "layer": [11, 14], "layer_id": 11, "learn": [1, 4, 11, 14], "least": 13, "len": [1, 2, 13], "length": [7, 13], "let": 1, "level": [6, 13, 14], "librari": 7, "like": [8, 14], "limit": 3, "line": 14, "lint": 4, "linux": 15, "list": [1, 2, 7, 11, 13, 14, 16], "llama": [2, 5, 7, 9, 10, 11, 13, 14], "llama3": 1, "llava": [1, 9, 13], "llava_llama_3": 1, "llm": [1, 3, 9, 14], "lmm": [1, 13], "lmsysorg": 10, "load": [1, 2, 6, 8, 13], "load_imag": 13, "local": 10, "local_example_llava_next": 7, "localhost": [0, 1, 6, 7, 13, 14], "locat": 13, "log": [6, 7, 8, 14], "logic": 13, "logit": [7, 11, 13], "logitsprocessor": 11, "logprob": [3, 13, 14], "logprob_start_len": 13, "london": 3, "long": [1, 14], "longer": 3, "longest": 8, "look": [5, 8], "loop": 7, "low": 13, "lpm": 8, "lsb": 2, "m": [0, 1, 2, 5, 6, 7, 10, 11, 13, 14], "machin": 10, "magic": 7, "mai": [1, 2, 7, 16], "main": [1, 13], "maintain": 11, "major": 11, "make": [0, 8, 9, 11, 13], "manag": 7, "mani": [3, 8, 11], "manner": 13, "mask": 7, "massiv": 14, "match": 8, "matched_stop": 14, "math": 7, "max": 13, "max_new_token": [1, 8, 13], "max_token": [1, 7, 14], "maximum": 13, "md": 4, "mean": [8, 13], "meanwhil": 5, "measur": 13, "media": 14, "median": 13, "meet": 1, "mem": [1, 13, 16], "memori": [1, 2], "messag": [1, 7, 14], "meta": [1, 2, 5, 7, 10, 13, 14], "method": 9, "millard": 3, "min_new_token": 13, "min_p": 13, "minicpm": 1, "ministri": 7, "mislead": 3, "miss": 5, "mistral": [1, 9], "mix": 13, "mixtral": 1, "modal": [1, 9], "model": [2, 3, 5, 8, 9, 10, 13, 14, 15], "model_path": 1, "moe": 1, "more": [1, 9, 10, 13, 14], "most": [5, 8, 11], "mount": 15, "muggl": 7, "mulit": 7, "multi": [1, 9], "multi_turn_quest": 7, "multipl": [1, 14], "multipli": 13, "must": 13, "my": 1, "my_model": 5, "my_model_templ": 5, "n": [7, 13, 14], "n1": 14, "n2": 14, "n3": 14, "n4": 14, "n5": 14, "name": [1, 2, 3, 5, 7, 13, 14, 15], "natur": 14, "nccl": 1, "ndescrib": 13, "need": [2, 5, 7, 10, 11, 15], "nemo": 1, "nest": 7, "network": 14, "neural": 14, "new": [1, 8, 9, 12, 14, 15], "new_token_ratio": 8, "next": [1, 6, 14], "ngener": 1, "nlarg": 14, "nllm": 14, "nlp": [1, 6], "nnode": 1, "node": [1, 2], "non": 7, "none": [6, 13, 14], "normal": 7, "note": [1, 2, 5, 11, 13, 15], "now": 7, "nsome": 14, "nsy": 2, "nt": 14, "nthe": 14, "nuanc": 14, "null": [10, 14], "num": [1, 2, 13], "number": [8, 13], "nvidia": [2, 13, 15], "nvtx": 2, "nyou": 13, "o": [2, 13, 14, 15], "object": 14, "obtain": 3, "occasion": 8, "occup": 7, "offer": 9, "offici": 5, "offlin": 1, "okai": 8, "olmo": 1, "omit": 3, "onc": [1, 3, 6, 14], "one": [3, 7, 13, 14], "onevis": [1, 13], "onli": [2, 3, 7, 10, 11, 13], "onlin": [1, 2], "only_run": 11, "oom": [8, 13], "open": [9, 10], "openai": [3, 5, 10, 13], "openai_api_kei": [7, 15], "oper": 10, "optim": 16, "option": [3, 13], "order": 7, "other": [3, 8, 10, 11, 13], "out": [1, 2, 7, 10, 16], "output": [1, 2, 11, 13], "ov": [1, 13], "overhead": [8, 13], "overlap": [3, 8], "overrid": 5, "own": [1, 10, 13], "p": [10, 13], "p2p": 1, "p99": 13, "page": [9, 16], "paragraph": 7, "parallel": [1, 8, 9, 13], "paramet": [8, 9], "pari": 3, "part": 11, "pass": [4, 7, 11, 14], "path": [0, 1, 2, 3, 5, 6, 7, 10, 13, 14], "patronu": 7, "pattern": 14, "peer": 1, "penal": 13, "per": 13, "perform": 3, "pgid": 14, "phoenix": 7, "pid": 14, "piec": 14, "pip": [0, 2, 12, 15], "pip3": 4, "plan": 10, "playground": 11, "pleas": [1, 7, 10], "png": 13, "poll": 14, "pool": [1, 8], "poorli": 3, "popen": [6, 14], "port": [1, 5, 6, 7, 10, 13, 14], "post": [13, 14], "post2": 10, "post3_vllm0": 15, "potter": 7, "power": 14, "pre": 4, "predict": 3, "prefil": [1, 2, 9, 11], "prefix": [8, 9], "prerequisit": 2, "presence_penalti": 13, "presid": [1, 3], "prev": 13, "primit": [3, 7], "print": [1, 2, 6, 7, 13, 14], "probabl": 7, "proceed": [6, 14], "process": [13, 14], "processlookuperror": 14, "product": 14, "profil": 9, "program": [9, 10], "progress_bar": 7, "project": [0, 5, 10, 12, 13, 15], "prompt": [1, 2, 7, 9, 13], "prompt_token": 14, "prompt_tokens_detail": 14, "proper": 10, "provid": [1, 2, 7, 9, 10], "pub": 2, "pull": 15, "pure": 7, "py": [0, 1, 2, 5, 7, 11, 12, 13], "pydant": 7, "pyproject": 12, "python": [1, 2, 5, 6, 7, 10, 12, 13, 14], "python3": [0, 1, 2, 6, 10, 11, 13, 15], "pytorch": [10, 16], "q": 7, "quantiz": [1, 9], "queri": 1, "question": [7, 14], "question_1": 7, "question_2": 7, "queue": 8, "quick": 2, "quick_start": 7, "qwen": [1, 9, 11], "qwen2": [1, 6, 11, 13], "r": [0, 1, 7], "radix": 2, "radixattent": [9, 11], "ran": 13, "random": [2, 13], "rang": [8, 9, 14], "rank": 1, "rate": 13, "ravenclaw": 7, "raw": 13, "reach": 13, "readi": [6, 14], "readm": 4, "readme_exampl": 7, "real": [1, 2], "recommend": [2, 10, 13, 14], "recoveri": 10, "reduc": [1, 8], "refer": [1, 11], "reference_hf": 11, "refus": 14, "regex": [7, 13], "regist": 5, "regular": [7, 13], "regular_expression_gen": 7, "relat": [5, 10], "relationship": 14, "releas": [2, 10], "relev": 13, "remot": 10, "remov": [0, 11], "repeat": 13, "repetition_penalti": 13, "replac": [1, 10, 11], "repo": 2, "report": [1, 16], "req": [8, 13], "request": [1, 6, 7, 13], "requestexcept": [6, 14], "requir": 0, "resourc": [10, 11], "respons": [1, 3, 6, 13, 14], "restart": 15, "result": [3, 13], "retoken": 13, "retracted_req": 8, "return": 13, "return_logprob": 13, "return_text_in_logprob": 13, "reus": 11, "rid": 13, "rm": 15, "rmsnorm": 11, "role": [1, 14], "root": 10, "run": [0, 2, 4, 6, 7, 11, 13, 14], "run_batch": 7, "runner_allow_runasroot": 15, "running_request": 13, "runtim": [9, 10], "runtimeendpoint": [3, 7], "same": [1, 2, 7, 11, 13], "sampl": [9, 10, 11, 16], "sampling_param": [1, 13], "scale": [10, 13], "schema": [7, 13], "script": 11, "search": 7, "secret": 10, "section": 13, "see": [1, 7, 8, 10, 13], "select": [7, 10], "self": 14, "semant": 14, "send": [1, 8, 13], "sentenc": 13, "sep": 5, "sep_styl": 5, "serv": [1, 2, 8, 9, 10, 13], "server": [0, 2, 5, 7, 8, 13], "server_process": 14, "servic": 10, "service_ti": 14, "set": [1, 2, 5, 7, 10, 13, 14, 16], "set_default_backend": 7, "sever": [1, 2], "sgl": [0, 1, 3, 7, 10, 12, 13, 15], "sgl0": 15, "sglang": [2, 4, 6, 12, 14, 15], "sglang_is_in_ci": 15, "sglang_use_modelscop": 1, "sh": 12, "share": [8, 15], "shell": 6, "shm": 15, "short": 13, "shorter": [3, 14], "should": [5, 11], "show": 7, "sigkil": 14, "signal": 14, "sigterm": 14, "siluandmul": 11, "similar": [11, 13], "simpl": 7, "simpli": 3, "sinc": 13, "singl": [1, 2, 10, 11, 13], "siri": 14, "size": [1, 2, 15], "sk": [7, 15], "skip": 13, "skip_special_token": 13, "sky": 10, "skyserv": 10, "sleep": [6, 14, 15], "slytherin": 7, "sm75": 10, "small": [1, 8], "smaller": [1, 16], "smollm": 1, "snippet": 2, "so": [1, 2, 13], "social": 14, "some": [2, 7, 11, 13, 15, 16], "sometim": 16, "sourc": [2, 9], "space": 13, "spaces_between_special_token": 13, "special": 13, "specif": [1, 10, 11], "specifi": [1, 3, 5, 7, 13, 14, 15], "srt": [9, 10, 13], "stablelm": 1, "stai": 7, "stand": [8, 14], "start": [11, 13], "startswith": 13, "state": [1, 7], "static": [1, 2, 13, 16], "statu": [7, 10], "status_cod": [6, 14], "stderr": [6, 14], "stdin": 6, "stdout": [6, 14], "step": [6, 14], "still": 13, "stop": [7, 8, 13, 14], "stop_str": 5, "stop_token_id": 13, "store": 13, "stori": 14, "str": 13, "strategi": 1, "stream": 1, "string": [8, 13], "strip": 13, "strong": 3, "strongli": 14, "structur": 9, "student": 7, "subprocess": [6, 14], "subset": 3, "success": 13, "suggest": 8, "summar": 14, "summari": 7, "supervis": 14, "suppli": [3, 13], "support": [3, 7, 9, 10, 13, 14], "sure": [0, 11, 13], "switch": 10, "sxm5": 13, "sy": 6, "syntax": 14, "system": [1, 2, 5, 7, 13, 14], "system_fingerprint": 14, "t4": 10, "take": 8, "teacher": 7, "techniqu": 14, "tee": 2, "temperatur": [1, 7, 13, 14], "templat": [1, 7, 13], "temporarili": 5, "tensor": [1, 9], "termin": [10, 14], "terminate_process": 14, "test": [1, 2, 13, 14, 15], "test_generation_model": 11, "test_oth": 11, "test_vision_openai_serv": 1, "testgenerationmodel": 11, "text": [1, 6, 11, 13, 14], "text_it": 7, "text_qa": 7, "thei": 13, "them": [10, 14, 16], "thi": [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 13, 14, 16], "thing": 8, "through": [7, 13], "throughput": [1, 13], "time": [1, 2, 6, 13, 14], "timeout": 14, "timeoutexpir": 14, "tip": 16, "tip_suggest": 7, "tmp": 15, "todai": [1, 6], "togeth": [1, 8], "tok": 13, "token": [1, 5, 7, 8, 9, 10], "token_id": 13, "token_length_norm": 3, "tokyo": 14, "toml": 12, "tone": 14, "too": 8, "tool": 7, "tool_cal": 14, "tool_us": 7, "top": 13, "top_k": 13, "top_logprobs_num": 13, "top_p": [1, 13], "topic": 14, "torch": [1, 8, 14], "torch2": 10, "torchao": 1, "total": [1, 13], "total_token": 14, "tp": 1, "tpot": 13, "tr": 2, "trace": 2, "traffic": 13, "train": [2, 14], "transform": 11, "translat": 14, "triton": 10, "troubleshoot": 9, "true": [1, 2, 6, 7, 13, 14, 15], "truncat": [1, 2], "try": [1, 6, 13, 14, 16], "ttft": 13, "tune": [1, 9, 13], "turbo": 7, "turn": 7, "twine": 12, "two": [1, 5, 7, 11], "txt": 0, "type": [1, 6, 14], "typic": 14, "u": 3, "ubuntu": 2, "ubuntu1804": 2, "ubuntu22": 15, "unconditional_likelihood_norm": 3, "under": [2, 4, 11], "understand": [11, 14], "union": 13, "unit": [1, 2, 7], "unittest": 11, "until": 13, "up": 10, "updat": [0, 2, 15], "upgrad": 10, "upload_pypi": 12, "upon": [1, 6], "url": 13, "us": [2, 3, 4, 5, 8, 13, 15], "us_president_exampl": 3, "usag": [1, 3, 8, 14], "user": [1, 3, 5, 7, 8, 13, 14], "usual": 13, "utf": 13, "util": [8, 13], "v": [10, 15], "v0": 10, "v1": [1, 6, 14], "valu": [1, 8, 13, 16], "valuabl": 11, "variabl": [1, 15], "variant": 2, "variou": 1, "vast": 14, "veri": [8, 11, 13], "version": [10, 14], "vertexai": 7, "video": 15, "view": 1, "virtual": 14, "vision": [1, 9], "visit": 0, "vl": 1, "vocab_s": 13, "w": 7, "wai": 11, "wait": 14, "wand": 7, "want": [1, 13], "warn": [8, 14], "we": [1, 13], "weight": [1, 2, 15], "welcom": 5, "well": 11, "were": 13, "what": [3, 7, 14], "when": [5, 7, 8, 13, 16], "where": [3, 14], "whether": 13, "which": [8, 13, 14], "while": [1, 2, 6, 10, 13, 14, 15], "whl": 10, "wide": [9, 14], "within": 7, "without": [2, 10, 14], "wood": 7, "word": 7, "work": [1, 5, 8, 15], "workflow": 7, "workload": 8, "write": 0, "x64": 15, "x86_64": 2, "xvers": 1, "xxx": 15, "y": [2, 15], "yaml": 10, "yi": 1, "yml": 10, "you": [0, 1, 2, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15], "your": [0, 1, 5, 7, 9, 10, 13, 14], "zip": 1}, "titles": ["SGLang Documentation", "Backend: SGLang Runtime (SRT)", "Benchmark and Profiling", "Choices Methods in SGLang", "Contributor Guide", "Custom Chat Template in SGLang Runtime", "Embedding Model", "Frontend: Structured Generation Language (SGLang)", "Guide on Hyperparameter Tuning", "SGLang Documentation", "Install SGLang", "How to Support a New Model", "PyPI Package Release Process", "Sampling Parameters in SGLang Runtime", "Quick Start", "Set Up Self-hosted Runners for GitHub Action", "Troubleshooting"], "titleterms": {"1": [1, 10, 15], "2": [10, 15], "3": [1, 10, 15], "4": 10, "405b": 1, "5": 10, "A": 6, "The": 16, "With": 10, "access": 16, "achiev": 8, "action": 15, "add": [4, 11, 15], "addit": 1, "advanc": 8, "all": 13, "an": 16, "api": [1, 6, 14], "argument": 1, "avoid": 8, "backend": [1, 9], "baselin": 13, "batch": 7, "benchmark": [1, 2, 13], "build": 0, "chat": 5, "choic": 3, "chunk": 8, "clean": 0, "cloud": 10, "code": [4, 12], "common": 10, "compat": [1, 6, 14], "compos": 10, "config": 15, "configur": 15, "conserv": 8, "constrain": 7, "contain": 15, "contributor": 4, "control": 7, "correct": 11, "cuda": 16, "curl": 6, "custom": 5, "debug": 11, "decod": 7, "depend": 0, "deploi": 0, "detail": 7, "docker": [10, 15], "document": [0, 9], "dp": 8, "embed": 6, "encount": 16, "engin": 1, "error": 16, "exampl": [7, 13], "featur": 7, "flow": 7, "format": 4, "fraction": 8, "frequenc": 13, "from": [1, 10, 11], "frontend": [7, 9], "gener": 7, "get": 9, "github": [12, 15], "greedi": 3, "guid": [4, 8], "hang": 16, "host": 15, "how": 11, "http": 1, "hyperparamet": 8, "illeg": 16, "implement": 7, "implic": 13, "instal": 10, "interact": 11, "json": 7, "kubernet": 10, "languag": 7, "latenc": 13, "launch": [6, 14], "length": 3, "likelihood": 3, "llama": 1, "local": 7, "make": 12, "max": 8, "mem": 8, "memori": [8, 13, 16], "method": [3, 10], "min": 13, "minor": 8, "modal": [7, 13], "model": [1, 6, 7, 11], "modelscop": 1, "more": 7, "multi": [7, 13], "new": [11, 13], "normal": [3, 13], "note": 10, "nsight": 2, "openai": [1, 6, 7, 14], "option": 8, "other": 2, "out": 8, "packag": 12, "parallel": 7, "paramet": 13, "peak": 8, "penalti": 13, "perform": [1, 13], "pip": 10, "polici": 8, "port": 11, "prefil": 8, "presenc": 13, "preview": 0, "process": 12, "profil": 2, "pypi": 12, "quick": [1, 7, 14], "refer": 9, "releas": 12, "repetit": 13, "request": [8, 14], "role": 7, "run": [1, 8, 10, 15], "runner": 15, "runtim": [1, 5, 13], "sampl": 13, "schedul": 8, "select": 3, "self": 15, "send": 14, "serv": 0, "server": [1, 6, 14, 16], "set": 15, "sglang": [0, 1, 3, 5, 7, 9, 10, 11, 13], "sh": 15, "size": 8, "skypilot": 10, "sourc": 10, "speed": 8, "srt": 1, "start": [1, 7, 9, 14, 15], "static": 8, "step": 15, "stream": [7, 13], "structur": 7, "submiss": 8, "suit": 11, "support": [1, 11], "templat": 5, "test": [4, 11], "throughput": 8, "tip": [2, 7], "togeth": 13, "token": [3, 13], "tp": 8, "troubleshoot": 16, "try": 8, "tune": 8, "tutori": 9, "uncondit": 3, "unit": 4, "up": 15, "updat": 12, "upload": 12, "us": [1, 6, 7, 10, 14], "version": 12, "vllm": 11, "wa": 16, "websit": 0, "without": 1, "your": [4, 8]}})
\ No newline at end of file
diff --git a/send_request.html b/send_request.html
index 6fe55df..f7fdb57 100644
--- a/send_request.html
+++ b/send_request.html
@@ -256,7 +256,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/send_request.ipynb?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/send_request.ipynb?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -273,7 +273,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/send_request.ipynb" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/send_request.ipynb" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/setup_github_runner.html b/setup_github_runner.html
index ac51c9a..94dc674 100644
--- a/setup_github_runner.html
+++ b/setup_github_runner.html
@@ -253,7 +253,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/setup_github_runner.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/setup_github_runner.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -270,7 +270,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/setup_github_runner.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/setup_github_runner.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
diff --git a/troubleshooting.html b/troubleshooting.html
index 6bbdff0..8841c8c 100644
--- a/troubleshooting.html
+++ b/troubleshooting.html
@@ -252,7 +252,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/blob/main/docs/en/troubleshooting.md?plain=1" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/blob/main/troubleshooting.md?plain=1" target="_blank"
    class="btn btn-sm btn-source-file-button dropdown-item"
    title="Show source"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -269,7 +269,7 @@
       
       
       
-      <li><a href="https://github.com/sgl-project/sglang/edit/main/docs/en/troubleshooting.md" target="_blank"
+      <li><a href="https://github.com/sgl-project/sglang/edit/main/troubleshooting.md" target="_blank"
    class="btn btn-sm btn-source-edit-button dropdown-item"
    title="Suggest edit"
    data-bs-placement="left" data-bs-toggle="tooltip"
@@ -431,7 +431,7 @@ <h2>CUDA error: an illegal memory access was encountered<a class="headerlink" hr
 <p>This error may be due to kernel errors or out-of-memory issues.</p>
 <ul class="simple">
 <li><p>If it is a kernel error, it is not easy to fix.</p></li>
-<li><p>If it is out-of-memory, sometimes it will report this error instead of “Out-of-memory.” In this case, try setting a smaller value for <code class="docutils literal notranslate"><span class="pre">--mem-fraction-static</span></code>. The default value of <code class="docutils literal notranslate"><span class="pre">--mem-fraction-static</span></code> is around 0.8 - 0.9. https://github.com/sgl-project/sglang/blob/1edd4e07d6ad52f4f63e7f6beaa5987c1e1cf621/python/sglang/srt/server_args.py#L92-L102</p></li>
+<li><p>If it is out-of-memory, sometimes it will report this error instead of “Out-of-memory.” In this case, try setting a smaller value for <code class="docutils literal notranslate"><span class="pre">--mem-fraction-static</span></code>. The default value of <code class="docutils literal notranslate"><span class="pre">--mem-fraction-static</span></code> is around 0.8 - 0.9.</p></li>
 </ul>
 </section>
 <section id="the-server-hangs">
@@ -439,7 +439,7 @@ <h2>The server hangs<a class="headerlink" href="#the-server-hangs" title="Link t
 <p>If the server hangs, try disabling some optimizations when launching the server.</p>
 <ul class="simple">
 <li><p>Add <code class="docutils literal notranslate"><span class="pre">--disable-cuda-graph</span></code>.</p></li>
-<li><p>Add <code class="docutils literal notranslate"><span class="pre">--disable-flashinfer-sampling</span></code>.</p></li>
+<li><p>Add <code class="docutils literal notranslate"><span class="pre">--sampling-backend</span> <span class="pre">pytorch</span></code>.</p></li>
 </ul>
 </section>
 </section>