
Merge pull request #121 from bytegorilla/main
Increase TPS to 4 digits
juberti authored Sep 5, 2024
2 parents 495accf + 92a5dec commit 30d8fe0
Showing 4 changed files with 9 additions and 5 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -10,6 +10,7 @@ Large Language Models (LLMs):
 - Llama2 and 3 from several different providers, including
   - Anyscale
   - Azure
+  - Cerebras
   - Cloudflare
   - Groq
   - OctoAI
2 changes: 1 addition & 1 deletion llm_benchmark.py
@@ -273,7 +273,7 @@ def on_token(ctx: llm_request.ApiContext, token: str):
         assert r.output
         minimal_output = r.error or r.output.replace("\n", "\\n").strip()[:64]
         print(
-            f"| {r.model:42} | {r.ttr:4.2f} | {r.ttft:4.2f} | {r.tps:3.0f} "
+            f"| {r.model:42} | {r.ttr:4.2f} | {r.ttft:4.2f} | {r.tps:4.0f} "
             f"| {r.output_tokens:3} | {r.total_time:5.2f} | {minimal_output} |"
         )
     elif args.format == "json":
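Why the extra digit matters: with `{r.tps:3.0f}`, any throughput of 1000 tokens/s or more renders as four characters and pushes every subsequent pipe in that row out of alignment, while `{r.tps:4.0f}` keeps the column fixed-width up to 9999. A minimal sketch with hypothetical TPS values:

```python
# Hypothetical TPS values spanning two to four digits.
for tps in (42.0, 567.0, 2345.0):
    print(f"| {tps:3.0f} |", f"| {tps:4.0f} |")
# |  42 | |   42 |
# | 567 | |  567 |
# | 2345 | | 2345 |   <- width 3 overflows; width 4 stays aligned
```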
6 changes: 3 additions & 3 deletions llm_benchmark_suite.py
@@ -579,9 +579,9 @@ def _format_response(
         return response.to_json(indent=2), "application/json"
     else:
         s = (
-            "| Provider/Model                             | TTR  | TTFT | TPS | ITk  | OTk | ITim | OTim | Total |"
+            "| Provider/Model                             | TTR  | TTFT | TPS  | ITk  | OTk | ITim | OTim | Total |"
             f" {'Response':{dlen}.{dlen}} |\n"
-            "| :----------------------------------------- | ---: | ---: | --: | ---: | --: | ---: | ---: | ----: |"
+            "| :----------------------------------------- | ---: | ---: | ---: | ---: | --: | ---: | ---: | ----: |"
             f" {':--':-<{dlen}.{dlen}} |\n"
         )

@@ -600,7 +600,7 @@ def _format_response(
             total_time = r.total_time or 0.0
             output = (r.error or r.output).strip().replace("\n", "\\n")
             s += (
-                f"| {r.model[:42]:42} | {ttr:4.2f} | {ttft:4.2f} | {tps:3.0f} "
+                f"| {r.model[:42]:42} | {ttr:4.2f} | {ttft:4.2f} | {tps:4.0f} "
                 f"| {in_tokens:4} | {out_tokens:3} | {in_time:4.2f} | {out_time:4.2f} "
                 f"| {total_time:5.2f} | {output:{dlen}.{dlen}} |\n"
             )
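In the suite's markdown output, the TPS header cell, alignment cell, and data cell all need the same width, or the rendered table drifts by one character per row. A minimal sketch of the three-row pattern after the change, using hypothetical model names and TPS values and a trimmed column set:

```python
# Hypothetical two-column version of the suite's table.
rows = [("groq/llama3-8b", 1234.0), ("azure/gpt-4o", 87.0)]
s = f"| {'Model':14} | {'TPS':4} |\n"      # header cells match data widths
s += f"| {':':-<14} | {'---:':4} |\n"      # markdown alignment row
for model, tps in rows:
    s += f"| {model:14} | {tps:4.0f} |\n"  # TPS now formatted with :4.0f
print(s, end="")
# | Model          | TPS  |
# | :------------- | ---: |
# | groq/llama3-8b | 1234 |
# | azure/gpt-4o   |   87 |
```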
5 changes: 4 additions & 1 deletion llm_request.py
@@ -17,6 +17,7 @@
 ApiResult = Tuple[aiohttp.ClientResponse, TokenGenerator]

 AZURE_OPENAI_API_VERSION = "2024-02-15-preview"
+MAX_TPS = 9999
 MAX_TTFT = 9.99
 MAX_TOTAL_TIME = 99.99

@@ -145,7 +146,9 @@ async def run(self, on_token: Optional[Callable[["ApiContext", str], None]] = None):
         if not self.metrics.error:
             token_time = end_time - first_token_time
             self.metrics.total_time = end_time - start_time
-            self.metrics.tps = min((self.metrics.output_tokens - 1) / token_time, 999)
+            self.metrics.tps = min((self.metrics.output_tokens - 1) / token_time, MAX_TPS)
+            if self.metrics.tps == MAX_TPS:
+                self.metrics.tps = 0.0
         else:
             self.metrics.ttft = MAX_TTFT
             self.metrics.tps = 0.0
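The clamp above echoes the MAX_TTFT and MAX_TOTAL_TIME sentinels nearby: TPS is computed over the inter-token window (the first token is excluded because timing starts at its arrival), capped at MAX_TPS, and a reading that lands exactly on the cap is reported as 0.0, presumably because it indicates a degenerate timing measurement rather than real throughput. A standalone sketch of that logic with hypothetical inputs (`tokens_per_second` is an illustrative helper, not part of the source):

```python
MAX_TPS = 9999

def tokens_per_second(output_tokens: int, token_time: float) -> float:
    # Inter-token rate: (n - 1) intervals arrive over token_time seconds.
    tps = min((output_tokens - 1) / token_time, MAX_TPS)
    # Hitting the cap implies token_time was effectively zero, so the
    # measurement is treated as invalid and zeroed rather than reported.
    return 0.0 if tps == MAX_TPS else tps

print(tokens_per_second(100, 0.5))   # 198.0
print(tokens_per_second(100, 1e-9))  # 0.0 (capped at 9999, then zeroed)
```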
