Merge pull request #28 from parea-ai/PAI-826-fix-ts-sdk-async-evals
feat(evals): async evals + cookbook using deployments
jalexanderII authored Mar 13, 2024
2 parents b1b8ca6 + 3345a9d commit b04a39f
Showing 3 changed files with 148 additions and 3 deletions.
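In brief: eval functions passed via `evalFuncs` may now be `async` — `handleRunningEvals` awaits each one before logging scores (see the `trace_utils.ts` hunk at the bottom). A minimal sketch of the pattern the new cookbook demonstrates, using only identifiers that appear in this diff (import paths as in the cookbook):

import { Log } from '../types';
import { trace } from '../utils/trace_utils';

// An async eval: scores the traced call's Log, e.g. via another LLM call.
async function myEval(log: Log): Promise<number> {
  return log.output?.toLowerCase().includes('yes') ? 1.0 : 0.0;
}

// Attach evals through TraceOptions; each one is now awaited after the call.
const answerQuestion = trace('answerQuestion', async (q: string) => `Answer: ${q}`, {
  evalFuncs: [myEval],
});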
144 changes: 144 additions & 0 deletions src/cookbook/tracing_with_fetched_deployed_prompt.ts
@@ -0,0 +1,144 @@
import * as dotenv from 'dotenv';

import { Log, UseDeployedPromptResponse } from '../types';
import { Parea } from '../client';
import { trace, traceInsert } from '../utils/trace_utils';
import OpenAI from 'openai';
import { patchOpenAI } from '../utils/wrap_openai';

dotenv.config();

const CONTEXT = `Company: Nike. 2023
FORM 10-K 35
OPERATING SEGMENTS
As discussed in Note 15 2014 Operating Segments and Related Information in the accompanying Notes to the Consolidated Financial Statements, our operating segments are evidence of the structure of the Company's internal organization. The NIKE Brand segments are defined by geographic regions for operations participating in NIKE Brand sales activity.
The breakdown of Revenues is as follows:
\\n\\n(Dollars in millions)
\\n\\nFISCAL 2023 FISCAL 2022
\\n\\n% CHANGE\\n\\n% CHANGE EXCLUDING CURRENCY (1) CHANGES FISCAL 2021\\n\\n% CHANGE\\n\\n
North America Europe, Middle East & Africa Greater China\\n\\n$\\n\\n21,608 $ 13,418 7,248\\n\\n18,353 12,479 7,547\\n\\n18 % 8 % -4 %\\n\\n18 % $ 21 % 4 %\\n\\n17,179 11,456 8,290\\n\\n7 % 9 % -9 %\\n\\nAsia Pacific & Latin America Global Brand Divisions\\n\\n(3)\\n\\n(2)\\n\\n6,431 58\\n\\n5,955 102\\n\\n8 % -43 %\\n\\n17 % -43 %\\n\\n5,343 25\\n\\n11 % 308 %\\n\\nTOTAL NIKE BRAND Converse\\n\\n$\\n\\n48,763 $ 2,427\\n\\n44,436 2,346\\n\\n10 % 3 %\\n\\n16 % $ 8 %\\n\\n42,293 2,205\\n\\n5 % 6 %\\n\\n(4)\\n\\nCorporate TOTAL NIKE, INC. REVENUES\\n\\n$\\n\\n27\\n\\n51,217 $\\n\\n(72) 46,710\\n\\n— 10 %\\n\\n— 16 % $\\n\\n40 44,538\\n\\n— 5 %`;

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

const p = new Parea(process.env.PAREA_API_KEY);

// Instrument the OpenAI client so completion calls are captured on the active trace.
patchOpenAI(openai);

async function callOpenAI(
  messages?: Record<string, any>[],
  model?: string,
  temperature?: number,
): Promise<OpenAI.Chat.Completions.ChatCompletionMessage> {
  // model/messages arrive at runtime from the deployed prompt, so they are
  // optional here even though the SDK's types mark them as required.
  // @ts-ignore
  const response = await openai.chat.completions.create({ model, messages, temperature });
  return response.choices[0].message;
}

const getPrompt = async (
  deployment_id: string,
  llm_inputs: Record<string, any>,
): Promise<UseDeployedPromptResponse> => {
  return await p.getPrompt({ deployment_id, llm_inputs });
};

async function llmJudgeEval(log: Log): Promise<number> {
  const question = log?.inputs?.question;
  const output = log.output;
  const target = log?.target ?? 'Global Brand Divisions';
  try {
    const response = await callOpenAI(
      [
        {
          role: 'system',
          content: 'You are CompareGPT, a machine to verify the groundedness of predictions. Answer with only yes/no.',
        },
        {
          role: 'user',
          content: `You are given a question, the corresponding ground-truth answer and a prediction from a model. Compare the "Ground-truth answer" and the "Prediction" to determine whether the prediction correctly answers the question. All information in the ground-truth answer must be present in the prediction, including numbers and dates. You must answer "no" if there are any specific details in the ground-truth answer that are not mentioned in the prediction. There should be no contradicting statements in the prediction. The prediction may contain extra information. If the prediction states something as a possibility, treat it as a definitive answer.
Question: ${question}
Ground-truth answer: ${target}
Prediction: ${output}
CompareGPT response:`,
        },
      ],
      'gpt-3.5-turbo',
      1.0,
    );
    return (response?.content ?? '').toLowerCase().includes('yes') ? 1.0 : 0.0;
  } catch (e) {
    // Fail closed: any judge error scores the prediction as ungrounded.
    return 0.0;
  }
}

const _ragTemplate = async (deployment_id: string, llm_inputs: Record<string, any>): Promise<string> => {
  // The Deployed Prompt is:
  // Use the following pieces of context to answer the question. Do not make up an answer if no context is provided to help answer it.
  //
  // Context:
  // ---------
  // {{context}}
  //
  // ---------
  // Question: {{question}}
  // ---------
  //
  // Answer:

  const deployedPrompt: UseDeployedPromptResponse = await getPrompt(deployment_id, llm_inputs);
  const response = await callOpenAI(
    deployedPrompt.prompt?.messages,
    deployedPrompt.model,
    deployedPrompt?.model_params?.temp,
  );
  console.log('deployedPrompt', deployedPrompt);
  return response.content ?? '';
};

const ragTemplate = trace(
  'ragTemplate',
  async (context: string, question: string): Promise<string> => {
    const deployment_id = 'p-dg9vE-qCJBA84QAnW9fQc';
    const llm_inputs = { context, question };
    traceInsert({ deployment_id });
    return await _ragTemplate(deployment_id, llm_inputs);
  },
  {
    evalFuncs: [llmJudgeEval],
  },
);

async function main() {
  return await ragTemplate(
    CONTEXT,
    'Which operating segment contributed least to total Nike brand revenue in fiscal 2023?',
  );
}

export async function runExperiment() {
  const e = p.experiment(
    [
      {
        context: CONTEXT,
        question: 'Which operating segment contributed least to total Nike brand revenue in fiscal 2023?',
        target: 'Global Brand Divisions',
      },
      {
        context: CONTEXT,
        question: 'Which operating segment contributed most to total Nike brand revenue in fiscal 2023?',
        target: 'North America',
      },
    ],
    ragTemplate,
  );
  return await e.run();
}

// Note: both entrypoints run when this module executes — a single traced call, then the experiment.
main().then((result) => console.log(result));

runExperiment().then(() => {
  console.log('Experiment complete!');
});
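A hedged way to smoke-test the judge outside an experiment (appended to this module, since llmJudgeEval is not exported; same env vars assumed; `sampleLog` is a hand-built stand-in, not a real trace Log):

// Illustrative only: the Log fields llmJudgeEval dereferences, filled in for
// the first experiment row. Casting via unknown sidesteps unrelated Log fields.
const sampleLog = {
  inputs: { question: 'Which operating segment contributed least to total Nike brand revenue in fiscal 2023?' },
  output: 'Global Brand Divisions',
  target: 'Global Brand Divisions',
} as unknown as Log;

llmJudgeEval(sampleLog).then((score) => console.log('judge score:', score)); // expect 1.0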
1 change: 1 addition & 0 deletions src/types.ts
@@ -173,6 +173,7 @@ export type TraceOptions = {
   accessOutputOfFunc?: (arg0: any) => string;
   applyEvalFrac?: number;
   deploymentId?: string;
+  target?: string;
 };

 export type UpdateLog = {
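The new `target` option mirrors the `target` carried by the cookbook's dataset rows and read by `llmJudgeEval` via `log?.target`. Assuming the option is forwarded onto the trace's Log — an inference from the type addition alone, not verified here — a sketch of pinning a static ground truth:

import { Log } from '../types';
import { trace } from '../utils/trace_utils';

async function exactMatch(log: Log): Promise<number> {
  // Assumption: log.target is populated from TraceOptions.target when no
  // experiment row supplies one — inferred from this type addition only.
  return log.output === log.target ? 1.0 : 0.0;
}

const answer = trace('answer', async (q: string) => 'North America', {
  evalFuncs: [exactMatch],
  target: 'North America',
});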
6 changes: 3 additions & 3 deletions src/utils/trace_utils.ts
@@ -194,16 +194,16 @@ export const handleRunningEvals = async (
   traceLog.output = outputForEvalMetrics;
   const scores: EvaluationResult[] = [];

-  options?.evalFuncs.forEach((func) => {
+  for (const func of options?.evalFuncs) {
     try {
-      const score = func(traceLog);
+      const score = await func(traceLog);
       if (score !== undefined && score !== null) {
         scores.push({ name: func.name, score });
       }
     } catch (e) {
       console.error(`Error occurred calling evaluation function '${func.name}', ${e}`, e);
     }
-  });
+  }

   await pareaLogger.updateLog({ trace_id: traceId, field_name_to_value_map: { scores: scores } });
   currentTraceData.traceLog.scores = scores;
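Why the `forEach` → `for...of` swap matters: `Array.prototype.forEach` discards the promises returned by async callbacks, so `scores` was shipped to `pareaLogger.updateLog` before any async eval had resolved. A standalone illustration (not SDK code):

async function demo() {
  const evals = [async () => 1.0, async () => 0.0];

  // Broken: forEach kicks off the callbacks and returns immediately.
  const before: number[] = [];
  evals.forEach(async (fn) => before.push(await fn()));
  console.log(before.length); // 0 — this is the moment updateLog used to fire

  // Fixed: for...of awaits each eval before continuing.
  const after: number[] = [];
  for (const fn of evals) {
    after.push(await fn());
  }
  console.log(after.length); // 2
}
demo();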
