From d26761f16b20d6cac80b56083689c4d7ad966ce4 Mon Sep 17 00:00:00 2001
From: Samuel Bushi
Date: Fri, 13 Dec 2024 04:04:57 +0000
Subject: [PATCH] evals docs

---
 docs/evaluation.md                 | 117 +++++++++-----
 docs/plugin-authoring-evaluator.md | 251 +++++++++++------------------
 2 files changed, 177 insertions(+), 191 deletions(-)

diff --git a/docs/evaluation.md b/docs/evaluation.md
index 5102ec901..b4bb19721 100644
--- a/docs/evaluation.md
+++ b/docs/evaluation.md
@@ -18,19 +18,35 @@ Genkit faithfulness and answer relevancy metrics:
 ```ts
-import { genkit } from 'genkit';
+import { genkit, z } from 'genkit';
 import { genkitEval, GenkitMetric } from '@genkit-ai/evaluator';
-import { vertexAI, textEmbedding004, gemini15Flash } from '@genkit-ai/vertexai';
+import { vertexAI, textEmbedding004, gemini15Flash, gemini15Pro } from '@genkit-ai/vertexai';
 
 const ai = genkit({
   plugins: [
     vertexAI(),
     genkitEval({
-      judge: gemini15Flash,
+      judge: gemini15Pro, // Use a powerful LLM as judge
       metrics: [GenkitMetric.FAITHFULNESS, GenkitMetric.ANSWER_RELEVANCY],
       embedder: textEmbedding004, // GenkitMetric.ANSWER_RELEVANCY requires an embedder
     }),
   ],
   // ...
 });
+
+// Define a simple menu suggestion flow
+export const menuSuggestionFlow = ai.defineFlow(
+  {
+    name: "menuSuggestionFlow",
+    inputSchema: z.string(),
+    outputSchema: z.string(),
+  },
+  async (query) => {
+    const llmResponse = await ai.generate({
+      model: gemini15Flash,
+      prompt: `Suggest a menu item using ${query}`,
+    });
+    return llmResponse.text;
+  }
+);
 ```
 
 **Note:** The configuration above requires installing the `genkit`,
@@ -74,6 +90,12 @@ input and reference output using this format instead:
 Note that you can use any JSON data type in the input JSON file. Genkit will
 pass them along with the same data type to your flow.
 
+You are now ready to start evaluating your flow. Begin by starting up your app with `genkit start`:
+
+```posix-terminal
+genkit start -- <command to run your app>
+```
+
 You can then use the `eval:flow` command to evaluate your flow against the test
 cases provided in `testInputs.json`.
 
@@ -87,13 +109,14 @@ If your flow requires auth, you may specify it using the `--auth` argument:
 genkit eval:flow menuSuggestionFlow --input testInputs.json --auth "{\"email_verified\": true}"
 ```
 
-You can then see evaluation results in the Developer UI by running:
+To run on a subset of the configured evaluators, use the `--evaluators` flag and
+provide a comma-separated list of evaluators by name:
 
 ```posix-terminal
-genkit start
+genkit eval:flow menuSuggestionFlow --input testInputs.json --evaluators=genkit/faithfulness,genkit/answer_relevancy
 ```
 
-Then navigate to `localhost:4000/evaluate`.
+Then navigate to `localhost:4000/evaluate` to view your evaluation results.
 
 Alternatively, you can provide an output file to inspect the output in a JSON
 file.
@@ -125,7 +148,7 @@ Genkit supports additional evaluators through plugins like the VertexAI Rapid Ev
 `eval:flow` is a convenient way to quickly evaluate the flow, but sometimes you
 might need more control over evaluation steps. This may occur if you are using a
 different framework and already have some output you would like to evaluate. You
-can perform all the steps that `eval:flow` performs semi-manually.
+can perform all the steps that `eval:flow` performs semi-automatically.
 
 You can batch run your Genkit flow and add a unique label to the run which then
 will be used to extract an evaluation dataset (a set of inputs, outputs, and
 contexts).
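+The steps below reuse the `testInputs.json` file from the `eval:flow` examples
+above. If you are following along, a minimal, purely illustrative input file for
+`menuSuggestionFlow` (which takes a plain string as input) could be as simple as:
+
+```json
+["Cheese", "Broccoli", "Spinach"]
+```
+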
 Run the flow over your test inputs:
 
 ```posix-terminal
-genkit flow:batchRun myRagFlow test_inputs.json --output flow_outputs.json --label customLabel
+genkit flow:batchRun menuSuggestionFlow testInputs.json --label myFavoriteFoods
 ```
 
 Extract the evaluation data:
 
 ```posix-terminal
-genkit eval:extractData myRagFlow --label customLabel --output customLabel_dataset.json
+genkit eval:extractData menuSuggestionFlow --label myFavoriteFoods --output myFavoriteFoods_dataset.json
 ```
 
 The exported data will be output as a JSON file with each testCase in the
 following format:
@@ -159,7 +182,14 @@ following format:
 ```
 
 The data extractor will automatically locate retrievers and add the produced
-docs to the context array. By default, `eval:run` will run against all
+docs to the context array. You can run evaluation metrics on this extracted dataset using the `eval:run`
+command:
+
+```posix-terminal
+genkit eval:run myFavoriteFoods_dataset.json
+```
+
+By default, `eval:run` will run against all
 configured evaluators, and like `eval:flow`, results for `eval:run` will appear
 in the evaluation page of Developer UI, located at `localhost:4000/evaluate`.
 
@@ -169,11 +199,40 @@ You can also provide custom extractors to be used in `eval:extractData` and
 `eval:flow` commands. Custom extractors allow you to override the default
 extraction logic giving you more power in creating datasets and evaluating them.
 
+Let us first introduce an auxiliary step in our `menuSuggestionFlow` example:
+
+```js
+export const menuSuggestionFlow = ai.defineFlow(
+  {
+    name: "menuSuggestionFlow",
+    inputSchema: z.string(),
+    outputSchema: z.string(),
+  },
+  async (query) => {
+    const allergyItems = await ai.run('allergyItems', async () => {
+      // You are allergic to these!
+      //
+      // This step is for demo purposes. Think of it as something auxiliary
+      // to your model input, e.g. an API call whose result you would
+      // include in the prompt.
+      return ["Shrimp", "Eggs", "Peanuts"];
+    });
+    const llmResponse = await ai.generate({
+      model: gemini15Flash,
+      prompt: `Suggest a menu item using ${query}. Assume that I am allergic to these ingredients: ${allergyItems.join(', ')}`,
+    });
+    return llmResponse.text;
+  }
+);
+```
+
+Now let us configure a custom extractor to use the output of the `allergyItems` step when evaluating this flow.
+
 To configure custom extractors, add a tools config file named
 `genkit-tools.conf.js` to your project root if you don't have one already.
 
 ```posix-terminal
-cd $GENKIT_PROJECT_HOME
+cd /path/to/your/genkit/app
 touch genkit-tools.conf.js
 ```
 
 In the tools config file, add the following code:
@@ -184,19 +243,22 @@ In the tools config file, add the following code:
 module.exports = {
   evaluators: [
     {
-      actionRef: '/flow/myFlow',
+      actionRef: '/flow/menuSuggestionFlow',
       extractors: {
-        context: { outputOf: 'foo-step' },
-        output: 'bar-step',
+        context: { outputOf: 'allergyItems' },
       },
     },
   ],
 };
 ```
 
-In this sample, you configure an extractor for `myFlow` flow. The config
-overrides the extractors for `context` and `output` fields and uses the default
-logic for the `input` field.
+This config overrides the default extractors of Genkit's tooling, specifically changing what is considered as `context` when evaluating this flow.
+
+Run the evaluation again, and you will see that `context` is now populated with the output of the `allergyItems` step:
+
+```posix-terminal
+genkit eval:flow menuSuggestionFlow --input testInputs.json
+```
 
 The specification of the evaluation extractors is as follows:
@@ -223,27 +285,6 @@ providing a function extractor, make sure that the output is a valid JSON
 string.
 For example: `"Hello, world!"` is not valid JSON; `"\"Hello, world!\""` is
 valid.
 
-### Running on existing datasets
-
-To run evaluation over an already extracted dataset:
-
-```posix-terminal
-genkit eval:run customLabel_dataset.json
-```
-
-To output to a different location, use the `--output` flag.
-
-```posix-terminal
-genkit eval:flow menuSuggestionFlow --input testInputs.json --output customLabel_evalresult.json
-```
-
-To run on a subset of the configured evaluators, use the `--evaluators` flag and
-provide a comma-separated list of evaluators by name:
-
-```posix-terminal
-genkit eval:run customLabel_dataset.json --evaluators=genkit/faithfulness,genkit/answer_relevancy
-```
-
 ### Synthesizing test data using an LLM
 
 Here's an example flow that uses a PDF file to generate possible questions users
diff --git a/docs/plugin-authoring-evaluator.md b/docs/plugin-authoring-evaluator.md
index 0e7a9b9ce..b0338c838 100644
--- a/docs/plugin-authoring-evaluator.md
+++ b/docs/plugin-authoring-evaluator.md
@@ -25,39 +25,45 @@ For this example, the prompt is going to ask the LLM to judge how delicious the
 Genkit’s `definePrompt` utility provides an easy way to define prompts with input and output validation. Here’s how you can set up an evaluation prompt with `definePrompt`.
 
 ```ts
+import { Genkit, z } from "genkit";
+
 const DELICIOUSNESS_VALUES = ['yes', 'no', 'maybe'] as const;
 
 const DeliciousnessDetectionResponseSchema = z.object({
   reason: z.string(),
   verdict: z.enum(DELICIOUSNESS_VALUES),
 });
-type DeliciousnessDetectionResponse = z.infer<typeof DeliciousnessDetectionResponseSchema>;
 
-const DELICIOUSNESS_PROMPT = ai.definePrompt(
-  {
-    name: 'deliciousnessPrompt',
-    inputSchema: z.object({
-      output: z.string(),
-    }),
-    outputSchema: DeliciousnessDetectionResponseSchema,
-  },
-  `You are a food critic. Assess whether the provided output sounds delicious, giving only "yes" (delicious), "no" (not delicious), or "maybe" (undecided) as the verdict.
+function getDeliciousnessPrompt(ai: Genkit) {
+  return ai.definePrompt({
+      name: 'deliciousnessPrompt',
+      input: {
+        schema: z.object({
+          responseToTest: z.string(),
+        }),
+      },
+      output: {
+        schema: DeliciousnessDetectionResponseSchema,
+      }
+    },
+    `You are a food critic. Assess whether the provided output sounds delicious, giving only "yes" (delicious), "no" (not delicious), or "maybe" (undecided) as the verdict.
 
-  Examples:
-  Output: Chicken parm sandwich
-  Response: { "reason": "A classic and beloved dish.", "verdict": "yes" }
+    Examples:
+    Output: Chicken parm sandwich
+    Response: { "reason": "A classic and beloved dish.", "verdict": "yes" }
 
-  Output: Boston Logan Airport tarmac
-  Response: { "reason": "Not edible.", "verdict": "no" }
+    Output: Boston Logan Airport tarmac
+    Response: { "reason": "Not edible.", "verdict": "no" }
 
-  Output: A juicy piece of gossip
-  Response: { "reason": "Metaphorically 'tasty' but not food.", "verdict": "maybe" }
+    Output: A juicy piece of gossip
+    Response: { "reason": "Metaphorically 'tasty' but not food.", "verdict": "maybe" }
 
-  New Output:
-  {{output}}
-  Response:
-  `
-);
+    New Output:
+    {{responseToTest}}
+    Response:
+    `
+  );
+}
 ```
 
 #### Define the scoring function
@@ -84,17 +90,17 @@ export async function deliciousnessScore<
     throw new Error('Output is required for Deliciousness detection');
   }
 
-  //Hydrate the prompt
-  const finalPrompt = DELICIOUSNESS_PROMPT.renderText({
-    output: d.output as string,
-  });
-
-  // Call the LLM to generate an evaluation result
-  const response = await ai.generate({
-    model: judgeLlm,
-    prompt: finalPrompt,
-    config: judgeConfig,
-  });
+  // Hydrate the prompt and call the judge LLM to generate an evaluation result
+  const deliciousnessPrompt = getDeliciousnessPrompt(ai);
+  const response = await deliciousnessPrompt(
+    {
+      responseToTest: d.output as string,
+    },
+    {
+      model: judgeLlm,
+      config: judgeConfig,
+    }
+  );
 
   // Parse the output
   const parsedResponse = response.output;
@@ -115,7 +121,7 @@ export async function deliciousnessScore<
 The final step is to write a function that defines the evaluator action itself.
 
 ```ts
-import { Genkit, ModelReference, z } from 'genkit';
+import { Genkit, ModelArgument, z } from 'genkit';
 import { BaseEvalDataPoint, EvaluatorAction } from 'genkit/evaluator';
 
 /**
@@ -125,12 +131,12 @@ export function createDeliciousnessEvaluator<
   ModelCustomOptions extends z.ZodTypeAny,
 >(
   ai: Genkit,
-  judge: ModelReference<ModelCustomOptions>,
-  judgeConfig: z.infer<ModelCustomOptions>
+  judge: ModelArgument<ModelCustomOptions>,
+  judgeConfig?: z.infer<ModelCustomOptions>
 ): EvaluatorAction {
   return ai.defineEvaluator(
     {
-      name: `myAwesomeEval/deliciousness`,
+      name: `deliciousnessEvaluator`,
       displayName: 'Deliciousness',
-      definition: 'Determines if output is considered delicous.',
+      definition: 'Determines if output is considered delicious.',
       isBilled: true,
@@ -146,6 +152,13 @@ export function createDeliciousnessEvaluator<
 }
 ```
+
+The `defineEvaluator` method is similar to other Genkit constructors such as `defineFlow` and `defineRetriever`. You provide an `EvaluatorFn` as the `defineEvaluator` callback. The `EvaluatorFn` accepts a `BaseEvalDataPoint`, which corresponds to a single entry in the dataset under evaluation, along with an optional custom-options parameter if one is specified. The function processes the datapoint and returns an `EvalResponse` object.
 
 Here are the Zod Schemas for `BaseEvalDataPoint` and `EvalResponse`:
@@ -204,10 +217,10 @@ Just like the LLM-based evaluator, define the scoring function. In this case, th
 import { BaseEvalDataPoint, Score } from 'genkit/evaluator';
 
 const US_PHONE_REGEX =
-  /^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4}$/i;
+  /[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4}/i;
 
 /**
- * Scores whether an individual datapoint matches a US Phone Regex.
+ * Scores whether a datapoint output contains a US phone number.
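+ * For example, "(555) 123-4567", "555-123-4567", and "555.123.4567" all
+ * produce a match against US_PHONE_REGEX above.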
 */
 export async function usPhoneRegexScore(
   dataPoint: BaseEvalDataPoint
 ): Promise<Score> {
   const d = dataPoint;
   if (!d.output || typeof d.output !== 'string') {
     throw new Error('String output is required for regex matching');
   }
   const matches = US_PHONE_REGEX.test(d.output as string);
   const reasoning = matches
-    ? `Output matched regex ${regex.source}`
-    : `Output did not match regex ${regex.source}`;
+    ? `Output matched US_PHONE_REGEX`
+    : `Output did not match US_PHONE_REGEX`;
   return {
     score: matches,
     details: { reasoning },
   };
 }
-
-/**
- * Create an EvalResponse from an individual scored datapoint.
- */
-function fillScores(dataPoint: BaseEvalDataPoint, score: Score): EvalResponse {
-  return {
-    testCaseId: dataPoint.testCaseId,
-    evaluation: score,
-  };
-}
 ```
 
 #### Define the evaluator action
 
 ```ts
 import { BaseEvalDataPoint, EvaluatorAction } from 'genkit/evaluator';
+import { Genkit } from 'genkit';
 
 /**
  * Configures a regex evaluator to match a US phone number.
  */
-export function createUSPhoneRegexEvaluator(
-  metrics: RegexMetric[]
-): EvaluatorAction[] {
-  return metrics.map((metric) => {
-    const regexMetric = metric as RegexMetric;
-    return defineEvaluator(
-      {
-        name: `myAwesomeEval/${metric.name.toLocaleLowerCase()}`,
-        displayName: 'Regex Match',
-        definition:
-          'Runs the output against a regex and responds with true if a match is found and false otherwise.',
-        isBilled: false,
-      },
-      async (datapoint: BaseEvalDataPoint) => {
-        const score = await usPhoneRegexScore(datapoint);
-        return fillScores(datapoint, score);
-      }
-    );
-  });
+export function createUSPhoneRegexEvaluator(ai: Genkit): EvaluatorAction {
+  return ai.defineEvaluator(
+    {
+      name: `usPhoneRegexEvaluator`,
+      displayName: "Regex Match for US Phone Number",
+      definition: "Uses a regex to check if the output contains a US phone number",
+      isBilled: false,
+    },
+    async (datapoint: BaseEvalDataPoint) => {
+      const score = await usPhoneRegexScore(datapoint);
+      return {
+        testCaseId: datapoint.testCaseId,
+        evaluation: score,
+      };
+    }
+  );
 }
 ```
 
 ## Configuration
 
-### Plugin Options
-
-Define the `PluginOptions` that the custom evaluator plugin will use. This object has no strict requirements and is dependent on the types of evaluators that are defined.
-
-At a minimum it will need to take the definition of which metrics to register.
-
-```ts
-export enum MyAwesomeMetric {
-  WORD_COUNT = 'WORD_COUNT',
-  US_PHONE_REGEX_MATCH = 'US_PHONE_REGEX_MATCH',
-}
-
-export interface PluginOptions {
-  metrics?: Array<MyAwesomeMetric>;
-}
-```
-
-If this new plugin uses an LLM as a judge and the plugin supports swapping out which LLM to use, define additional parameters in the `PluginOptions` object.
-
-```ts
-export interface PluginOptions {
-  judge: ModelReference<ModelCustomOptions>;
-  judgeConfig?: z.infer<ModelCustomOptions>;
-  metrics?: Array<MyAwesomeMetric>;
-}
-```
-
 ### Plugin definition
 
-Plugins are registered with the framework via the `genkit.config.ts` file in a project. To be able to configure a new plugin, define a function that defines a `GenkitPlugin` and configures it with the `PluginOptions` defined above.
+Plugins are registered with the framework by installing them when you initialize the Genkit object in your application. To define a new plugin, use the `genkitPlugin` helper method to instantiate all Genkit Actions within the plugin context. This example has two evaluators, the deliciousness evaluator and the US phone number evaluator; this is where they are registered with the plugin and with Firebase Genkit.
 ```ts
 import { GenkitPlugin, genkitPlugin } from 'genkit/plugin';
+import { Genkit, ModelArgument, z } from 'genkit';
 
-export function myAwesomeEval(
-  options: PluginOptions
-): GenkitPlugin {
+export function myCustomEvals<
+  ModelCustomOptions extends z.ZodTypeAny
+>(options: {
+  judge: ModelArgument<ModelCustomOptions>;
+  judgeConfig?: z.infer<ModelCustomOptions>;
+}): GenkitPlugin {
   // Define the new plugin
-  return genkitPlugin(
-    'myAwesomeEval',
-    async (ai: Genkit) => {
-      const { judge, judgeConfig, metrics } = options;
-      const evaluators: EvaluatorAction[] = metrics.map((metric) => {
-        switch (metric) {
-          case DELICIOUSNESS:
-            // This evaluator requires an LLM as judge
-            return createDeliciousnessEvaluator(ai, judge, judgeConfig);
-          case US_PHONE_REGEX_MATCH:
-            // This evaluator does not require an LLM
-            return createUSPhoneRegexEvaluator();
-        }
-      });
-      return { evaluators };
-    });
+  return genkitPlugin("myCustomEvals", async (ai: Genkit) => {
+    const { judge, judgeConfig } = options;
+
+    // The plugin instantiates our custom evaluators within the context
+    // of the `ai` object, making them available
+    // throughout our Genkit application.
+    createDeliciousnessEvaluator(ai, judge, judgeConfig);
+    createUSPhoneRegexEvaluator(ai);
+  });
 }
 
-export default myAwesomeEval;
+export default myCustomEvals;
 ```
 
 ### Configure Genkit
 
 Add the newly defined plugin to your Genkit configuration.
 
 For evaluation with Gemini, disable safety settings so that the evaluator can accept, detect, and score potentially harmful content.
 
 ```ts
-import { gemini15Flash } from '@genkit-ai/googleai';
+import { vertexAI, gemini15Pro } from '@genkit-ai/vertexai';
 
 const ai = genkit({
   plugins: [
+    vertexAI(),
     ...
-    myAwesomeEval({
-      judge: gemini15Flash,
-      judgeConfig: {
-        safetySettings: [
-          {
-            category: 'HARM_CATEGORY_HATE_SPEECH',
-            threshold: 'BLOCK_NONE',
-          },
-          {
-            category: 'HARM_CATEGORY_DANGEROUS_CONTENT',
-            threshold: 'BLOCK_NONE',
-          },
-          {
-            category: 'HARM_CATEGORY_HARASSMENT',
-            threshold: 'BLOCK_NONE',
-          },
-          {
-            category: 'HARM_CATEGORY_SEXUALLY_EXPLICIT',
-            threshold: 'BLOCK_NONE',
-          },
-        ],
-      },
-      metrics: [
-        MyAwesomeMetric.DELICIOUSNESS,
-        MyAwesomeMetric.US_PHONE_REGEX_MATCH
-      ],
+    myCustomEvals({
+      judge: gemini15Pro,
     }),
   ],
   ...
 });
 ```
 
-## Testing
+## Using your custom evaluators
 
-The same issues that apply to evaluating the quality of the output of a generative AI feature apply to evaluating the judging capacity of an LLM-based evaluator.
+Once you instantiate your custom evaluators within the Genkit context (either through a plugin or directly), they are ready to be used. Let us try out the deliciousness evaluator with a few sample inputs and outputs.
 
-To get a sense of whether the custom evaluator performs at the expected level, create a set of test cases that have a clear right and wrong answer.
-
-As an example for deliciousness, that might look like a json file `deliciousness_dataset.json`:
+Create a json file `deliciousness_dataset.json` with the following content:
 
 ```json
 [
@@ -396,8 +341,6 @@ As an example for deliciousness, that might look like a json file `deliciousness
 ]
 ```
 
-These examples can be human generated or you can ask an LLM to help create a set of test cases that can be curated. There are many available benchmark datasets that can be used as well.
-
 Then use the Genkit CLI to run the evaluator against these test cases.
 
 ```posix-terminal
 genkit eval:run deliciousness_dataset.json
@@ -408,3 +351,5 @@ genkit eval:run deliciousness_dataset.json
 ```
 
 Navigate to `localhost:4000/evaluate` to view your results in the Genkit UI.
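+If other evaluators are also configured in your project, you can limit a run to specific
+metrics with the `--evaluators` flag. The evaluator name below assumes the definitions shown
+earlier on this page; substitute whatever names (and plugin prefixes) you actually registered:
+
+```posix-terminal
+genkit eval:run deliciousness_dataset.json --evaluators=deliciousnessEvaluator
+```
+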
+
+Note that your confidence in a custom evaluator will increase as you benchmark it against standard datasets or approaches. Iterate on the results of such benchmarks to improve the evaluator's performance until it reaches the desired quality.
\ No newline at end of file