feat: harbor boost, yapf, bench judge prompts, bench logger + ETA,
fix: bench params,
chore: cleanup
av committed Sep 22, 2024
1 parent 3d3ce7f commit 387f806
Showing 48 changed files with 1,793 additions and 260 deletions.
11 changes: 10 additions & 1 deletion .gitignore
@@ -1,6 +1,12 @@
# Local .env
.env

# Python
.venv

# Node
node_modules/

# Open WebUI
open-webui/*
open-webui/config.json
@@ -82,4 +88,7 @@ jupyter/workspace/*
!jupyter/workspace/000-sample.ipynb

# History
.history
.history

# Boost
boost/src/**/__pycache__/
6 changes: 6 additions & 0 deletions .style.yapf
@@ -0,0 +1,6 @@
[style]
based_on_style = google
indent_width = 2
continuation_indent_width = 2
spaces_before_comment = 4
dedent_closing_brackets=true
12 changes: 9 additions & 3 deletions README.md
@@ -18,7 +18,7 @@ Harbor is a containerized LLM toolkit that allows you to run LLMs and additional

##### Satellites

[SearXNG](https://github.com/av/harbor/wiki/2.3.1-Satellite:-SearXNG) ⦁︎ [Perplexica](https://github.com/av/harbor/wiki/2.3.2-Satellite:-Perplexica) ⦁︎ [Dify](https://github.com/av/harbor/wiki/2.3.3-Satellite:-Dify) ⦁︎ [Plandex](https://github.com/av/harbor/wiki/2.3.4-Satellite:-Plandex) ⦁︎ [LiteLLM](https://github.com/av/harbor/wiki/2.3.5-Satellite:-LiteLLM) ⦁︎ [LangFuse](https://github.com/av/harbor/wiki/2.3.6-Satellite:-langfuse) ⦁︎ [Open Interpreter](https://github.com/av/harbor/wiki/2.3.7-Satellite:-Open-Interpreter) ⦁︎ [cloudflared](https://github.com/av/harbor/wiki/2.3.8-Satellite:-cloudflared) ⦁︎ [cmdh](https://github.com/av/harbor/wiki/2.3.9-Satellite:-cmdh) ⦁︎ [fabric](https://github.com/av/harbor/wiki/2.3.10-Satellite:-fabric) ⦁︎ [txtai RAG](https://github.com/av/harbor/wiki/2.3.11-Satellite:-txtai-RAG) ⦁︎ [TextGrad](https://github.com/av/harbor/wiki/2.3.12-Satellite:-TextGrad) ⦁︎ [Aider](https://github.com/av/harbor/wiki/2.3.13-Satellite:-aider) ⦁︎ [aichat](https://github.com/av/harbor/wiki/2.3.14-Satellite:-aichat) ⦁︎ [omnichain](https://github.com/av/harbor/wiki/2.3.16-Satellite:-omnichain) ⦁︎ [Harbor Bench](https://github.com/av/harbor/wiki/5.-Harbor-Bench.md) ⦁︎ [lm-evaluation-harness](https://github.com/av/harbor/wiki/2.3.17-Satellite:-lm-evaluation-harness) ⦁︎ [JupyterLab](https://github.com/av/harbor/wiki/2.3.18-Satellite:-JupyterLab) ⦁︎ [ol1](https://github.com/av/harbor/wiki/2.3.19-Satellite:-ol1)
[Harbor Bench](https://github.com/av/harbor/wiki/5.1.-Harbor-Bench.md) ⦁︎ [Harbor Boost](https://github.com/av/harbor/wiki/5.2.-Harbor-Boost.md) ⦁︎ [SearXNG](https://github.com/av/harbor/wiki/2.3.1-Satellite:-SearXNG) ⦁︎ [Perplexica](https://github.com/av/harbor/wiki/2.3.2-Satellite:-Perplexica) ⦁︎ [Dify](https://github.com/av/harbor/wiki/2.3.3-Satellite:-Dify) ⦁︎ [Plandex](https://github.com/av/harbor/wiki/2.3.4-Satellite:-Plandex) ⦁︎ [LiteLLM](https://github.com/av/harbor/wiki/2.3.5-Satellite:-LiteLLM) ⦁︎ [LangFuse](https://github.com/av/harbor/wiki/2.3.6-Satellite:-langfuse) ⦁︎ [Open Interpreter](https://github.com/av/harbor/wiki/2.3.7-Satellite:-Open-Interpreter) ⦁︎ [cloudflared](https://github.com/av/harbor/wiki/2.3.8-Satellite:-cloudflared) ⦁︎ [cmdh](https://github.com/av/harbor/wiki/2.3.9-Satellite:-cmdh) ⦁︎ [fabric](https://github.com/av/harbor/wiki/2.3.10-Satellite:-fabric) ⦁︎ [txtai RAG](https://github.com/av/harbor/wiki/2.3.11-Satellite:-txtai-RAG) ⦁︎ [TextGrad](https://github.com/av/harbor/wiki/2.3.12-Satellite:-TextGrad) ⦁︎ [Aider](https://github.com/av/harbor/wiki/2.3.13-Satellite:-aider) ⦁︎ [aichat](https://github.com/av/harbor/wiki/2.3.14-Satellite:-aichat) ⦁︎ [omnichain](https://github.com/av/harbor/wiki/2.3.16-Satellite:-omnichain) ⦁︎ [lm-evaluation-harness](https://github.com/av/harbor/wiki/2.3.17-Satellite:-lm-evaluation-harness) ⦁︎ [JupyterLab](https://github.com/av/harbor/wiki/2.3.18-Satellite:-JupyterLab) ⦁︎ [ol1](https://github.com/av/harbor/wiki/2.3.19-Satellite:-ol1)

## Blitz Tour

Expand All @@ -40,6 +40,10 @@ harbor up llamacpp tgi litellm vllm tabbyapi aphrodite sglang ktransformers
# Run different Frontends
harbor up librechat chatui bionicgpt hollama

# Get a free quality boost with
# built-in optimizing proxy
harbor up boost

# Use FLUX in Open WebUI in one command
harbor up comfyui

@@ -184,8 +188,10 @@ harbor open
Read about supported services and the ways to configure them.
- [Compatibility](https://github.com/av/harbor/wiki/4.-Compatibility)<br/>
Known compatibility issues between the services and models as well as possible workarounds.
- [Harbor Bench](https://github.com/av/harbor/wiki/5.-Harbor-Bench)<br/>
Documentation on built-in LLM benchmarking service.
- [Harbor Bench](https://github.com/av/harbor/wiki/5.1.-Harbor-Bench)<br/>
Documentation for the built-in LLM benchmarking service.
- [Harbor Boost](https://github.com/av/harbor/wiki/5.2.-Harbor-Boost)<br/>
Documentation for the built-in LLM optimiser proxy.
- [Harbor Compose Setup](https://github.com/av/harbor/wiki/6.-Harbor-Compose-Setup)<br/>
Read about the way Harbor uses Docker Compose to manage services.
- [Adding A New Service](https://github.com/av/harbor/wiki/7.-Adding-A-New-Service)<br/>
5 changes: 3 additions & 2 deletions bench/src/bench.ts
@@ -1,8 +1,9 @@
import { config } from "./config.ts";
import { BenchRunner } from "./runner.ts";
import { log } from "./log.ts";

async function main() {
console.log(`
log(`
░█▀▄░█▀▀░█▀█░█▀▀░█░█
░█▀▄░█▀▀░█░█░█░░░█▀█
░▀▀░░▀▀▀░▀░▀░▀▀▀░▀░▀
@@ -16,7 +17,7 @@ async function main() {
}

async function handleSignal() {
console.info("Interrupted");
log("Interrupted");
Deno.exit(0);
}

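The log.ts module that bench.ts now imports is not shown in this commit view. A minimal sketch consistent with the call sites above and the chalk dependency added in deps.ts might look like this (the timestamp prefix and styling are assumptions):

```ts
// log.ts: hypothetical sketch; the real module is not shown in this diff.
// Wraps console.log with a dimmed ISO timestamp via chalk, matching the
// log("...") call sites in bench.ts.
import { chalk } from "./deps.ts";

export function log(...args: unknown[]) {
  const ts = new Date().toISOString();
  console.log(chalk.dim(ts), ...args);
}
```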
2 changes: 2 additions & 0 deletions bench/src/config.ts
@@ -23,7 +23,9 @@ export const config = {
model: Deno.env.get('HARBOR_BENCH_JUDGE'),
apiUrl: Deno.env.get('HARBOR_BENCH_JUDGE_API'),
apiKey: Deno.env.get('HARBOR_BENCH_JUDGE_API_KEY'),
prompt: Deno.env.get('HARBOR_BENCH_JUDGE_PROMPT') ?? 'default',
temperature: 0,
seed: 42,
} as LLMConfig,
};

4 changes: 3 additions & 1 deletion bench/src/deps.ts
@@ -2,4 +2,6 @@ export * as args from "jsr:@std/cli/parse-args";
export * as log from "jsr:@std/log";
export * as csv from "jsr:@std/csv";
export * as yaml from "jsr:@std/yaml";
export * as path from "jsr:@std/path";
export * as path from "jsr:@std/path";

export { default as chalk } from "https://deno.land/x/[email protected]/source/index.js"
60 changes: 56 additions & 4 deletions bench/src/judge.ts
@@ -37,9 +37,6 @@ Question:
Answer: Paris
Criterion: Answer mentions Paris
Correct response: No
</instructions>
<question>
@@ -53,4 +50,59 @@ ${answer}
<criteria>
${criteria}
</criteria>
`;
`.trim();

/**
* This is a specific format tailored for the
* https://huggingface.co/flowaicom/Flow-Judge-v0.1
*/
export const flow = ({
question,
answer,
criteria,
}) => `
# GOAL
Your job is to evaluate a task carried out by an AI system powered by a large language model.
You will be provided with the inputs and output of the task, as well as the evaluation criteria and scoring rubric. Your task is to evaluate the output of the AI system based on the evaluation criteria and scoring rubric provided.
# INPUT/s
Below are the inputs required for performing the task:
<inputs>
<question>
${question}
</question>
</inputs>
# OUTPUT
Below is the output of the task:
<output>
${answer}
</output>
# EVALUATION CRITERIA AND SCORING RUBRIC
Here are the evaluation criteria and the rubric that you need to use for evaluating the task:
<evaluation_criteria>
${criteria}
</evaluation_criteria>
<scoring_rubric>
- Score 0: The answer does not meet the criteria, or meets them only partially.
- Score 1: The answer fully meets the criteria.
</scoring_rubric>
# INSTRUCTIONS FOR THE EVALUATION
1. Understand the task and criteria: Familiarize yourself with the task to be evaluated. Review the evaluation criteria and scoring rubric to understand the different levels of performance and the descriptions for each score.
2. Review the inputs and output: Look at the inputs provided for the task. Examine the output generated from completing the task.
3. Compare output to score descriptions: Compare the output against the criteria and score descriptions in the scoring rubric. For each criterion, decide which description best matches the output.
4. After comparing the output to the score descriptions, pay attention to the small details that might impact the final score that you assign. Sometimes a small difference can dictate the final score.
5. Write verbal feedback justifying your evaluation that includes a detailed rationale, referring to specific aspects of the output and comparing them to the rubric.
6. Assign a final score based on the scoring rubric.
## FORMAT FOR THE EVALUATION
- Write the verbal feedback inside <feedback> tags without any additional surrounding text.
- Write the numeric score inside <score> tags, without any additional surrounding text and always after the feedback.
Please accurately evaluate the task. Strictly adhere to the evaluation criteria and rubric.
`.trim();

export const prompts = {
default: prompt,
flow,
};
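Taken together with the new HARBOR_BENCH_JUDGE_PROMPT setting in config.ts above, these templates are presumably selected by name when the judge runs. A rough sketch of that wiring (the call site, the `judge` config key, and the score-parsing helper are assumptions; only the templates themselves appear in this diff):

```ts
// Hypothetical call site, not part of this diff.
import { config } from "./config.ts";
import { prompts } from "./judge.ts";

// Pick the template named by HARBOR_BENCH_JUDGE_PROMPT, falling back to "default".
const name = (config.judge.prompt ?? "default") as keyof typeof prompts;
const template = prompts[name] ?? prompts.default;

const rendered = template({
  question: "What is the capital of France?",
  answer: "Paris",
  criteria: "Answer mentions Paris",
});

// The flow rubric asks the judge to reply with <feedback> and <score> tags;
// a caller might extract the 0/1 verdict like this.
function parseFlowScore(completion: string): number | null {
  const match = completion.match(/<score>\s*([01])\s*<\/score>/);
  return match ? Number(match[1]) : null;
}

console.log(rendered.length, parseFlowScore("<feedback>Mentions Paris.</feedback><score>1</score>"));
```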