feat: add evaluator, properties and llm app
1 parent 715fc90 · commit a75e167
Showing 11 changed files with 1,998 additions and 11 deletions.
@@ -0,0 +1,99 @@
from typing import Optional

import openai
from evaluator import EvalProperty, OutputFormat, PropertyResult, TestCase

property_llm = "gpt-3.5-turbo-0613"


def evaluate_property_with_llm(
    model: str, system_message: str, user_message: str
) -> PropertyResult:
    # response_model is handled by instructor.patch(), which llm_app applies to openai at import time.
    return openai.ChatCompletion.create(
        model=model,
        response_model=PropertyResult,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
    )


def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> Optional[PropertyResult]:
    if test_case.reference_output and llm_app_result.answer:
        result = evaluate_property_with_llm(
            model=property_llm,
            system_message="Evaluate the answer. The answer should be factually consistent with the reference answer. If not, explain why.",
            user_message=f"Answer: {llm_app_result.answer}\nReference Answer: {test_case.reference_output.answer}",
        )
    else:
        result = None
    return result


def improves_historical_answer(test_case: TestCase, llm_app_result: OutputFormat) -> Optional[PropertyResult]:
    if test_case.test_input and test_case.historical_output and llm_app_result.answer:
        result = evaluate_property_with_llm(
            model=property_llm,
            system_message="Evaluate the new answer. Is the new answer better than the old answer? Explain why.",
            user_message=f"Question: {test_case.test_input.question}\nOld answer: {test_case.historical_output.answer}\nNew answer: {llm_app_result.answer}",
        )
    else:
        result = None
    return result


def takes_feedback_into_account(
    test_case: TestCase, llm_app_result: OutputFormat
) -> Optional[PropertyResult]:
    if (
        test_case.test_input
        and test_case.historical_output
        and llm_app_result.answer
        and test_case.historical_feedback
    ):
        result = evaluate_property_with_llm(
            model=property_llm,
            system_message="Evaluate the new answer. Does the new answer improve upon the old one by taking the feedback into account? Explain why.",
            user_message=f"Question: {test_case.test_input.question}\nOld answer: {test_case.historical_output.answer}\nOld feedback: {test_case.historical_feedback}\nNew answer: {llm_app_result.answer}",
        )
    else:
        result = None
    return result


def length_within_bounds(test_case: TestCase, llm_app_result: OutputFormat) -> Optional[PropertyResult]:
    if test_case.reference_output and llm_app_result.answer:
        if len(llm_app_result.answer) <= 1.2 * len(test_case.reference_output.answer):
            result = PropertyResult(feedback="The answer is not too long.", pass_fail=True)
        else:
            result = PropertyResult(feedback="The answer is too long.", pass_fail=False)
    else:
        result = None
    return result


properties = [
    EvalProperty(
        property_name="FactuallyConsistent",
        description="The answer is factually consistent with the reference answer.",
        eval_func=factually_consistent,
    ),
    # EvalProperty(
    #     property_name="CorrectLanguage",
    #     description="The answer is in the same language as the question.",
    # ),
    EvalProperty(
        property_name="ImprovesHistoricalAnswer",
        description="The answer improves upon the historical answer. It is more complete, more concise, or more accurate.",
        eval_func=improves_historical_answer,
    ),
    EvalProperty(
        property_name="TakesFeedbackIntoAccount",
        description="The answer improves upon the historical answer by taking the feedback into account.",
        eval_func=takes_feedback_into_account,
    ),
    EvalProperty(
        property_name="LengthWithinBounds",
        description="The answer is max 20% longer than the reference answer.",
        eval_func=length_within_bounds,
    ),
]
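
For reference, a property function can also be exercised on its own, outside the evaluator loop. The following is a minimal sketch, not part of the commit: it assumes the files are importable as properties, evaluator and llm_app (names inferred from the imports and the commit message), uses made-up test data, and needs OPENAI_API_KEY set because factually_consistent calls the patched ChatCompletion endpoint.

from evaluator import TestCase
from llm_app import InputFormat, OutputFormat
from properties import factually_consistent, length_within_bounds

# Hypothetical test data for illustration only
case = TestCase(
    test_id="burn-001",
    test_input=InputFormat(question="How should I treat a minor burn?"),
    reference_output=OutputFormat(answer="Cool the burn under running water for about 20 minutes."),
)
new_answer = OutputFormat(answer="Hold the burn under cool running water for roughly 20 minutes.")

# Purely local check, no LLM call: passes because the answer stays within 120% of the reference length
print(length_within_bounds(test_case=case, llm_app_result=new_answer))

# LLM-judged check: returns a PropertyResult with feedback and pass_fail
print(factually_consistent(test_case=case, llm_app_result=new_answer))

# Properties whose required fields are missing return None and are skipped by the Evaluator,
# e.g. improves_historical_answer needs historical_output, which this case does not set.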
@@ -0,0 +1,91 @@
import json
import os
from datetime import datetime
from typing import Callable, Optional

from llm_app import BaseApp, InputFormat, OutputFormat
from pydantic import BaseModel
from tqdm import tqdm


class TestCase(BaseModel):
    test_id: str
    test_input: InputFormat
    reference_output: Optional[OutputFormat] = None
    historical_output: Optional[OutputFormat] = None
    historical_feedback: Optional[str] = None


class EvalProperty(BaseModel):
    property_name: str
    description: str
    eval_func: Callable


class PropertyResult(BaseModel):
    feedback: str
    pass_fail: bool
    property_name: Optional[str] = None


class TestCaseResult(BaseModel):
    test_case_id: str
    output: OutputFormat
    property_results: list[PropertyResult]


class Evaluator:
    def __init__(
        self,
        test_set: list[TestCase],
        properties: list[EvalProperty],
        results_dir: str = "eval_results",
    ):
        self.test_set = test_set
        self.properties = properties
        self.results_dir = results_dir

    def evaluate(
        self,
        llm_app: BaseApp,
        exp_name: Optional[str] = None,
        exp_descr: str = "",
    ):
        # If no experiment name is provided, use the current timestamp
        if not exp_name:
            exp_name = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Create experiment directory
        exp_dir = os.path.join(self.results_dir, exp_name)
        os.makedirs(exp_dir, exist_ok=True)

        # Loop over test cases
        for test_case in tqdm(
            self.test_set, desc="Evaluating test cases", unit="test case", total=len(self.test_set)
        ):
            # Pass the test case to the LLM app
            app_output = llm_app(app_input=test_case.test_input)
            # Evaluate properties
            property_results = []
            for prop in self.properties:
                print(f"Evaluating property {prop.property_name}")
                r = prop.eval_func(test_case=test_case, llm_app_result=app_output)
                # If the result is None, the property is not applicable to this test case, so skip it
                if r:
                    # Store the property results per test case in a list
                    property_results.append(
                        PropertyResult(
                            property_name=prop.property_name,
                            feedback=r.feedback,
                            pass_fail=r.pass_fail,
                        )
                    )
            # Store results as JSON
            tcr = TestCaseResult(
                test_case_id=test_case.test_id, output=app_output, property_results=property_results
            )
            tcr_json = tcr.model_dump_json()
            with open(os.path.join(exp_dir, f"{tcr.test_case_id}.json"), "w") as f:
                f.write(tcr_json)
        # Save the LLM app config dict as JSON
        with open(os.path.join(exp_dir, "llm_app.json"), "w") as f:
            f.write(json.dumps(llm_app.cfg))
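
A minimal end-to-end sketch of how these pieces are presumably wired together (again assuming the module names properties, evaluator and llm_app; the test case is made up, and the FactuallyConsistent property will call the OpenAI API, so a key must be configured):

from evaluator import Evaluator, TestCase
from llm_app import BaseApp, InputFormat, OutputFormat
from properties import properties

# Hypothetical single-case test set
test_set = [
    TestCase(
        test_id="cpr-001",
        test_input=InputFormat(question="What is the correct compression rate for CPR?"),
        reference_output=OutputFormat(answer="100 to 120 compressions per minute."),
    )
]

evaluator = Evaluator(test_set=test_set, properties=properties)
# BaseApp is the stub that always answers "The answer is always 42.";
# results land in eval_results/baseline/cpr-001.json, plus llm_app.json with the app config.
evaluator.evaluate(llm_app=BaseApp(), exp_name="baseline")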
@@ -0,0 +1,52 @@
from typing import Optional

import instructor
import openai
from pydantic import BaseModel

# Patch the (pre-1.0) openai module so ChatCompletion.create accepts a response_model
# and returns a parsed pydantic object.
instructor.patch()


class InputFormat(BaseModel):
    question: str


class OutputFormat(BaseModel):
    answer: str


class BaseApp:
    def __init__(self, config: Optional[dict] = None):
        self.cfg = {}
        if config:
            self.cfg.update(config)

    def __call__(self, app_input: InputFormat) -> OutputFormat:
        return OutputFormat(answer="The answer is always 42.")


class GptBaseApp(BaseApp):
    def __init__(self, config: Optional[dict] = None):
        self.cfg = {
            "gpt_version": "gpt-3.5-turbo-0613",
            "system_prompt": "You are a first-aid expert. Answer the user question. Be accurate and concise.",
        }
        if config:
            self.cfg.update(config)

    def __call__(self, app_input: InputFormat) -> OutputFormat:
        result: OutputFormat = openai.ChatCompletion.create(
            model=self.cfg["gpt_version"],
            response_model=OutputFormat,
            messages=[
                {"role": "system", "content": self.cfg["system_prompt"]},
                {"role": "user", "content": app_input.question},
            ],
        )
        return result


class HumanApp(BaseApp):
    def __call__(self, app_input: InputFormat) -> OutputFormat:
        answer = input(app_input.question)
        return OutputFormat(answer=answer)
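
A quick sketch of swapping in the GPT-backed app with an overridden config; keys passed in config simply replace the defaults via cfg.update(), so anything not provided keeps its default value:

from llm_app import GptBaseApp, InputFormat

# Hypothetical config override for illustration only
app = GptBaseApp(config={"system_prompt": "You are a first-aid expert. Answer in one short sentence."})
print(app(app_input=InputFormat(question="How do I treat a bee sting?")).answer)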