From 53729b74acb89c272ab449bf9e0b55859a8430cb Mon Sep 17 00:00:00 2001
From: Hugo Larcher <hugo.larcher@huggingface.co>
Date: Fri, 16 Aug 2024 19:43:30 +0200
Subject: [PATCH] doc: Add metrics documentation and add a 'Reference' section
 (#2230)

* doc: Add metrics documentation and add a 'Reference' section

* doc: Add API reference

* doc: Refactor API reference

* fix: Message API link

* Bad rebase

* Moving the docs.

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
---
 .github/workflows/build_pr_documentation.yaml |   2 +-
 .pre-commit-config.yaml                       |   2 +-
 docs/source/_toctree.yml                      |  14 ++-
 .../api_reference.md}                         |  33 +++--
 .../launcher.md                               |   0
 docs/source/reference/metrics.md              |  30 +++++
 router/src/server.rs                          | 114 ++++++++++++++++++
 update_doc.py                                 |   2 +-
 8 files changed, 179 insertions(+), 18 deletions(-)
 rename docs/source/{messages_api.md => reference/api_reference.md} (84%)
 rename docs/source/{basic_tutorials => reference}/launcher.md (100%)
 create mode 100644 docs/source/reference/metrics.md

diff --git a/.github/workflows/build_pr_documentation.yaml b/.github/workflows/build_pr_documentation.yaml
index bf03bfdf362..a5ce39a5f5e 100644
--- a/.github/workflows/build_pr_documentation.yaml
+++ b/.github/workflows/build_pr_documentation.yaml
@@ -11,7 +11,7 @@ concurrency:
 
 jobs:
   build:
-    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yaml@main
+    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
     with:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6f5e685ea8e..0c8b6885483 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -5,7 +5,7 @@ repos:
     -   id: check-yaml
     -   id: end-of-file-fixer
     -   id: trailing-whitespace
-        exclude: docs/source/basic_tutorials/launcher.md
+        exclude: docs/source/reference/launcher.md
 -   repo: https://github.com/psf/black
     rev: 24.2.0
     hooks:
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index e97c00aa260..f52fa2ec2a5 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -17,8 +17,6 @@
     title: Installation from source
   - local: supported_models
     title: Supported Models and Hardware
-  - local: messages_api
-    title: Messages API
   - local: architecture
     title: Internal Architecture
   - local: usage_statistics
@@ -33,8 +31,6 @@
     title: Serving Private & Gated Models
   - local: basic_tutorials/using_cli
     title: Using TGI CLI
-  - local: basic_tutorials/launcher
-    title: All TGI CLI options
   - local: basic_tutorials/non_core_models
     title: Non-core Model Serving
   - local: basic_tutorials/safety
@@ -48,6 +44,14 @@
   - local: basic_tutorials/train_medusa
     title: Train Medusa
   title: Tutorials
+- sections:
+  - local: reference/launcher
+    title: All TGI CLI options
+  - local: reference/metrics
+    title: Exported Metrics
+  - local: reference/api_reference
+    title: API Reference
+  title: Reference
 - sections:
   - local: conceptual/streaming
     title: Streaming
@@ -64,7 +68,7 @@
   - local: conceptual/speculation
     title: Speculation (Medusa, ngram)
   - local: conceptual/guidance
-    title: How Guidance Works (via outlines
+    title: How Guidance Works (via outlines)
   - local: conceptual/lora
     title: LoRA (Low-Rank Adaptation)
 
diff --git a/docs/source/messages_api.md b/docs/source/reference/api_reference.md
similarity index 84%
rename from docs/source/messages_api.md
rename to docs/source/reference/api_reference.md
index 250aaae2249..52043c80f8a 100644
--- a/docs/source/messages_api.md
+++ b/docs/source/reference/api_reference.md
@@ -1,17 +1,30 @@
-# Messages API
+# HTTP API Reference
 
-Text Generation Inference (TGI) now supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. This feature is available starting from version 1.4.0. You can use OpenAI's client libraries or third-party libraries expecting OpenAI schema to interact with TGI's Messages API. Below are some examples of how to utilize this compatibility.
+#### Table of Contents
 
-> **Note:** The Messages API is supported from TGI version 1.4.0 and above. Ensure you are using a compatible version to access this feature.
+- [Text Generation Inference custom API](#text-generation-inference-custom-api)
+- [OpenAI Messages API](#openai-messages-api)
+  - [Making a Request](#making-a-request)
+  - [Streaming](#streaming)
+  - [Synchronous](#synchronous)
+  - [Hugging Face Inference Endpoints](#hugging-face-inference-endpoints)
+  - [Cloud Providers](#cloud-providers)
+    - [Amazon SageMaker](#amazon-sagemaker)
 
-#### Table of Contents
+The HTTP API is a RESTful API that allows you to interact with the text-generation-inference component. Two endpoints are available:
+* Text Generation Inference [custom API](https://huggingface.github.io/text-generation-inference/)
+* OpenAI's [Messages API](#openai-messages-api)
+
+
+## Text Generation Inference custom API
 
-- [Making a Request](#making-a-request)
-- [Streaming](#streaming)
-- [Synchronous](#synchronous)
-- [Hugging Face Inference Endpoints](#hugging-face-inference-endpoints)
-- [Cloud Providers](#cloud-providers)
-  - [Amazon SageMaker](#amazon-sagemaker)
+Check the [API documentation](https://huggingface.github.io/text-generation-inference/) for more information on how to interact with the Text Generation Inference API.
+
+## OpenAI Messages API
+
+Text Generation Inference (TGI) now supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. This feature is available starting from version 1.4.0. You can use OpenAI's client libraries or third-party libraries expecting OpenAI schema to interact with TGI's Messages API. Below are some examples of how to utilize this compatibility.
+
+> **Note:** The Messages API is supported from TGI version 1.4.0 and above. Ensure you are using a compatible version to access this feature.
 
 ## Making a Request
 
diff --git a/docs/source/basic_tutorials/launcher.md b/docs/source/reference/launcher.md
similarity index 100%
rename from docs/source/basic_tutorials/launcher.md
rename to docs/source/reference/launcher.md
diff --git a/docs/source/reference/metrics.md b/docs/source/reference/metrics.md
new file mode 100644
index 00000000000..d34d38eab7d
--- /dev/null
+++ b/docs/source/reference/metrics.md
@@ -0,0 +1,30 @@
+# Metrics
+
+TGI exposes multiple metrics that can be collected via the `/metrics` Prometheus endpoint.
+These metrics can be used to monitor the performance of TGI, autoscale deployments, and help identify bottlenecks.
+
+The following metrics are exposed:
+
+| Metric Name                                | Description                                                                              | Type      | Unit    |
+|--------------------------------------------|------------------------------------------------------------------------------------------|-----------|---------|
+| `tgi_batch_current_max_tokens`             | Maximum tokens for the current batch                                                     | Gauge     | Count   |
+| `tgi_batch_current_size`                   | Current batch size                                                                       | Gauge     | Count   |
+| `tgi_batch_decode_duration`                | Time spent decoding a batch per method (prefill or decode)                               | Histogram | Seconds |
+| `tgi_batch_filter_duration`                | Time spent filtering batches and sending generated tokens per method (prefill or decode) | Histogram | Seconds |
+| `tgi_batch_forward_duration`               | Batch forward duration per method (prefill or decode)                                    | Histogram | Seconds |
+| `tgi_batch_inference_count`                | Inference calls per method (prefill or decode)                                           | Counter   | Count   |
+| `tgi_batch_inference_duration`             | Batch inference duration                                                                 | Histogram | Seconds |
+| `tgi_batch_inference_success`              | Number of successful inference calls per method (prefill or decode)                      | Counter   | Count   |
+| `tgi_batch_next_size`                      | Batch size of the next batch                                                             | Histogram | Count   |
+| `tgi_queue_size`                           | Current queue size                                                                       | Gauge     | Count   |
+| `tgi_request_count`                        | Total number of requests                                                                 | Counter   | Count   |
+| `tgi_request_duration`                     | Total time spent processing the request (e2e latency)                                    | Histogram | Seconds |
+| `tgi_request_generated_tokens`             | Generated tokens per request                                                             | Histogram | Count   |
+| `tgi_request_inference_duration`           | Request inference duration                                                               | Histogram | Seconds |
+| `tgi_request_input_length`                 | Input token length per request                                                           | Histogram | Count   |
+| `tgi_request_max_new_tokens`               | Maximum new tokens per request                                                           | Histogram | Count   |
+| `tgi_request_mean_time_per_token_duration` | Mean time per token per request (inter-token latency)                                    | Histogram | Seconds |
+| `tgi_request_queue_duration`               | Time spent in the queue per request                                                      | Histogram | Seconds |
+| `tgi_request_skipped_tokens`               | Speculated tokens per request                                                            | Histogram | Count   |
+| `tgi_request_success`                      | Number of successful requests                                                            | Counter   | Count   |
+| `tgi_request_validation_duration`          | Time spent validating the request                                                        | Histogram | Seconds |
diff --git a/router/src/server.rs b/router/src/server.rs
index ab268efa2f7..8ec7a8716ed 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -2003,6 +2003,120 @@ async fn start(
         .install_recorder()
         .expect("failed to install metrics recorder");
 
+    // Metrics descriptions
+    metrics::describe_counter!("tgi_request_success", "Number of successful requests");
+    metrics::describe_histogram!(
+        "tgi_request_duration",
+        metrics::Unit::Seconds,
+        "Request duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_validation_duration",
+        metrics::Unit::Seconds,
+        "Request validation duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_queue_duration",
+        metrics::Unit::Seconds,
+        "Request queue duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_inference_duration",
+        metrics::Unit::Seconds,
+        "Request inference duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_mean_time_per_token_duration",
+        metrics::Unit::Seconds,
+        "Mean time per token per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_generated_tokens",
+        metrics::Unit::Count,
+        "Generated tokens per request"
+    );
+    metrics::describe_counter!(
+        "tgi_batch_inference_count",
+        metrics::Unit::Count,
+        "Inference calls per method (prefill or decode)"
+    );
+    metrics::describe_counter!(
+        "tgi_request_count",
+        metrics::Unit::Count,
+        "Total number of requests"
+    );
+    metrics::describe_counter!(
+        "tgi_batch_inference_success",
+        metrics::Unit::Count,
+        "Number of successful inference calls per method (prefill or decode)"
+    );
+    metrics::describe_gauge!(
+        "tgi_batch_current_size",
+        metrics::Unit::Count,
+        "Current batch size"
+    );
+    metrics::describe_gauge!("tgi_queue_size", metrics::Unit::Count, "Current queue size");
+    metrics::describe_gauge!(
+        "tgi_batch_current_max_tokens",
+        metrics::Unit::Count,
+        "Maximum tokens for the current batch"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_max_new_tokens",
+        metrics::Unit::Count,
+        "Maximum new tokens per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_inference_duration",
+        metrics::Unit::Seconds,
+        "Batch inference duration"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_forward_duration",
+        metrics::Unit::Seconds,
+        "Batch forward duration per method (prefill or decode)"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_skipped_tokens",
+        metrics::Unit::Count,
+        "Speculated tokens per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_filter_duration",
+        metrics::Unit::Seconds,
+        "Time spent filtering batches and sending generated tokens per method (prefill or decode)"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_queue_duration",
+        metrics::Unit::Seconds,
+        "Time spent in the queue per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_validation_duration",
+        metrics::Unit::Seconds,
+        "Time spent validating the request"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_duration",
+        metrics::Unit::Seconds,
+        "Total time spent processing the request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_decode_duration",
+        metrics::Unit::Seconds,
+        "Time spent decoding a batch per method (prefill or decode)"
+    );
+    metrics::describe_histogram!(
+        "tgi_request_input_length",
+        metrics::Unit::Count,
+        "Input token length per request"
+    );
+    metrics::describe_histogram!(
+        "tgi_batch_next_size",
+        metrics::Unit::Count,
+        "Batch size of the next batch"
+    );
+
     // CORS layer
     let allow_origin = allow_origin.unwrap_or(AllowOrigin::any());
     let cors_layer = CorsLayer::new()
diff --git a/update_doc.py b/update_doc.py
index e887e1c6dc0..3fb0d314305 100644
--- a/update_doc.py
+++ b/update_doc.py
@@ -63,7 +63,7 @@ def check_cli(check: bool):
     final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n"
     block = []
 
-    filename = "docs/source/basic_tutorials/launcher.md"
+    filename = "docs/source/reference/launcher.md"
     if check:
         with open(filename, "r") as f:
             doc = f.read()