feat: add evaluator, properties and llm app
StijnGoossens committed Oct 1, 2023
1 parent 715fc90 commit a75e167
Showing 11 changed files with 1,998 additions and 11 deletions.
8 changes: 6 additions & 2 deletions .gitignore
@@ -23,9 +23,9 @@ data/
.hypothesis/

# Jupyter
*.ipynb
#*.ipynb
.ipynb_checkpoints/
notebooks/
#notebooks/

# macOS
.DS_Store
@@ -62,3 +62,7 @@ __pycache__/

# VS Code
.vscode/

# Project
keys
openai_key
17 changes: 17 additions & 0 deletions README.md
@@ -4,6 +4,23 @@

Evaluates LLM-based applications.

## To-do's
- [x] Convert my EHBO (first-aid) notes into question-answer pairs, using OpenAI Function Calling (see the sketch after this list).
- [ ] Turn the question-answer pairs into a test set (in progress).
- [x] Build Streamlit app for testing myself.
  - [ ] Bug: when I click the 'Evaluate' button, the app jumps to the next question.
- [x] Build LLM component to evaluate the given answers by comparing them with the reference answers.
- [x] Build LLM 'app' that can answer the questions.
- [x] Evaluate the LLM app with the LLM evaluator.
- [ ] Streamlit page to view the evaluation results.
- [ ] Add the question-answer pairs as the knowledge base for that app.
- [ ] Re-evaluate the LLM app with the LLM evaluator.
- [ ] Compare the results.
- [ ] Streamlit page to view, edit and add test cases.
- [ ] Cache the OpenAI function calls.
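
The note-to-question-answer extraction from the first item is handled outside this diff (in `qa_extraction.py`, whose `load_qa_pairs` is imported by `app.py` below). As a rough, non-authoritative sketch of how such an extraction could look with the `instructor`-patched function calling this project already uses — the `QAPair`, `QAPairs` and `extract_qa_pairs` names are hypothetical, not part of the commit:

```python
from typing import List

import instructor
import openai
from pydantic import BaseModel

instructor.patch()  # lets openai.ChatCompletion.create accept a response_model


class QAPair(BaseModel):
    question: str
    answer: str


class QAPairs(BaseModel):
    pairs: List[QAPair]


def extract_qa_pairs(notes: str, model: str = "gpt-3.5-turbo-0613") -> List[QAPair]:
    """Turn free-form first-aid notes into question-answer pairs via function calling."""
    result: QAPairs = openai.ChatCompletion.create(
        model=model,
        response_model=QAPairs,
        messages=[
            {"role": "system", "content": "Extract question-answer pairs from the notes."},
            {"role": "user", "content": notes},
        ],
    )
    return result.pairs
```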



## Using

_Python package_: to add and install this package as a dependency of your project, run `poetry add llm-app-eval`.
1,247 changes: 1,238 additions & 9 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -21,6 +21,9 @@ version_files = ["pyproject.toml:version"]
poethepoet = ">=0.20.0"
python = ">=3.9,<3.9.7 || >3.9.7,<4.0"
streamlit = ">=1.19.0"
instructor = "^0.2.8"
langchain = "^0.0.305"
llama-index = "^0.8.36"

[tool.poetry.group.test.dependencies] # https://python-poetry.org/docs/master/managing-dependencies/
black = ">=23.3.0"
42 changes: 42 additions & 0 deletions src/llm_app_eval/app.py
@@ -2,6 +2,48 @@

from importlib.metadata import version

import numpy as np
import streamlit as st
from evaluator import Evaluator
from qa_extraction import load_qa_pairs

st.title(f"llm-app-eval v{version('llm-app-eval')}") # type: ignore[no-untyped-call]


qa_pairs = load_qa_pairs("src/llm_app_eval/data/question_answer_pairs.csv")
evaluator = Evaluator(llm="gpt-4")

# Shuffle the question and answer pairs
np.random.seed(42)
np.random.shuffle(qa_pairs)
# Display a question and answer pair
if "idx" in st.session_state:
idx = st.session_state.idx
else:
idx = 0
st.session_state.idx = idx
st.write(f"Question {idx + 1} of {len(qa_pairs)}")
qa = qa_pairs[idx]
st.header("Question")
st.write(qa.question)
st.header("Answer")
answer = st.text_input("Answer")
st.header("Reference Answer")
st.write(qa.answer)


eval_button = st.button("Evaluate")
if eval_button:
    result = evaluator.evaluate(qa.question, answer, qa.answer)
    st.write("✅" if result.pass_fail else "❌")
    st.write(result.feedback)
    st.session_state.idx = min(st.session_state.idx + 1, len(qa_pairs) - 1)
else:
    # Display previous and next buttons
    col1, col2, col3 = st.columns(3)
    if col1.button("Previous"):
        st.session_state.idx = max(st.session_state.idx - 1, 0)
    if col2.button("Random"):
        st.session_state.idx = np.random.randint(0, len(qa_pairs))
    if col3.button("Next"):
        st.session_state.idx = min(st.session_state.idx + 1, len(qa_pairs) - 1)
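
Regarding the bug noted in the README ("Evaluate" jumps to the next question): it comes from the `st.session_state.idx` increment inside the `eval_button` branch above, and from the navigation buttons only being rendered in the `else` branch. A minimal, non-authoritative sketch of one way to decouple evaluation from navigation with Streamlit `on_click` callbacks, keeping the `evaluator.evaluate(question, answer, reference)` call signature used in this file (note that this signature differs from the `Evaluator` class added in `evaluator.py` further down in this commit):

```python
# Drop-in replacement for the button section of app.py above (assumes st, np,
# evaluator, qa_pairs, qa and answer are defined as earlier in the file).
def go_previous() -> None:
    st.session_state.idx = max(st.session_state.idx - 1, 0)


def go_next() -> None:
    st.session_state.idx = min(st.session_state.idx + 1, len(qa_pairs) - 1)


def go_random() -> None:
    st.session_state.idx = np.random.randint(0, len(qa_pairs))


col1, col2, col3 = st.columns(3)
col1.button("Previous", on_click=go_previous)
col2.button("Random", on_click=go_random)
col3.button("Next", on_click=go_next)

if st.button("Evaluate"):
    # Evaluate in place; the question only changes when a navigation button is clicked.
    result = evaluator.evaluate(qa.question, answer, qa.answer)
    st.write("✅" if result.pass_fail else "❌")
    st.write(result.feedback)
```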
99 changes: 99 additions & 0 deletions src/llm_app_eval/eval_properties.py
@@ -0,0 +1,99 @@
from typing import Optional

import openai
from evaluator import EvalProperty, OutputFormat, PropertyResult, TestCase

property_llm = "gpt-3.5-turbo-0613"


def evaluate_property_with_llm(
    model: str, system_message: str, user_message: str
) -> PropertyResult:
    # response_model is supported because importing evaluator pulls in llm_app,
    # which patches openai.ChatCompletion.create via instructor.patch().
    return openai.ChatCompletion.create(
        model=model,
        response_model=PropertyResult,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
    )


def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> Optional[PropertyResult]:
    if test_case.reference_output and llm_app_result.answer:
        result = evaluate_property_with_llm(
            model=property_llm,
            system_message="Evaluate the answer. The answer should be factually consistent with the reference answer. If not, explain why.",
            user_message=f"Answer: {llm_app_result.answer}\nReference Answer: {test_case.reference_output.answer}",
        )
    else:
        result = None
    return result


def improves_historical_answer(test_case: TestCase, llm_app_result: OutputFormat) -> Optional[PropertyResult]:
    if test_case.test_input and test_case.historical_output and llm_app_result.answer:
        result = evaluate_property_with_llm(
            model=property_llm,
            system_message="Evaluate the new answer. Is the new answer better than the old answer? Explain why.",
            user_message=f"Question: {test_case.test_input.question}\nOld answer: {test_case.historical_output.answer}\nNew answer: {llm_app_result.answer}",
        )
    else:
        result = None
    return result


def takes_feedback_into_account(
    test_case: TestCase, llm_app_result: OutputFormat
) -> Optional[PropertyResult]:
    if (
        test_case.test_input
        and test_case.historical_output
        and llm_app_result.answer
        and test_case.historical_feedback
    ):
        result = evaluate_property_with_llm(
            model=property_llm,
            system_message="Evaluate the new answer. Does the new answer improve upon the old one by taking the feedback into account? Explain why.",
            user_message=f"Question: {test_case.test_input.question}\nOld answer: {test_case.historical_output.answer}\nOld feedback: {test_case.historical_feedback}\nNew answer: {llm_app_result.answer}",
        )
    else:
        result = None
    return result


def length_within_bounds(test_case: TestCase, llm_app_result: OutputFormat) -> Optional[PropertyResult]:
    if test_case.reference_output and llm_app_result.answer:
        if len(llm_app_result.answer) <= 1.2 * len(test_case.reference_output.answer):
            result = PropertyResult(feedback="The answer is not too long.", pass_fail=True)
        else:
            result = PropertyResult(feedback="The answer is too long.", pass_fail=False)
    else:
        result = None
    return result


properties = [
    EvalProperty(
        property_name="FactuallyConsistent",
        description="The answer is factually consistent with the reference answer.",
        eval_func=factually_consistent,
    ),
    # EvalProperty(
    #     property_name="CorrectLanguage",
    #     description="The answer is in the same language as the question.",
    # ),
    EvalProperty(
        property_name="ImprovesHistoricalAnswer",
        description="The answer improves upon the historical answer. It is more complete, more concise, or more accurate.",
        eval_func=improves_historical_answer,
    ),
    EvalProperty(
        property_name="TakesFeedbackIntoAccount",
        description="The answer improves upon the historical answer by taking the feedback into account.",
        eval_func=takes_feedback_into_account,
    ),
    EvalProperty(
        property_name="LengthWithinBounds",
        description="The answer is at most 20% longer than the reference answer.",
        eval_func=length_within_bounds,
    ),
]
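
As a quick, non-authoritative illustration of how a single property is exercised (this one runs locally, without an OpenAI call), using the `TestCase`, `InputFormat` and `OutputFormat` models defined in `evaluator.py` and `llm_app.py` further down in this commit; run it from `src/llm_app_eval/` since the modules use flat imports, and note that the question and answers are made-up example data:

```python
from eval_properties import length_within_bounds
from evaluator import TestCase
from llm_app import InputFormat, OutputFormat

case = TestCase(
    test_id="demo-1",
    test_input=InputFormat(question="How do you treat a minor burn?"),
    reference_output=OutputFormat(answer="Cool the burn under lukewarm running water for 10 to 20 minutes."),
)
app_output = OutputFormat(answer="Hold the burn under cool running water for about fifteen minutes.")

result = length_within_bounds(test_case=case, llm_app_result=app_output)
if result is not None:  # None means the property does not apply to this test case
    print(result.pass_fail, result.feedback)
```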
91 changes: 91 additions & 0 deletions src/llm_app_eval/evaluator.py
@@ -0,0 +1,91 @@
import json
import os
from datetime import datetime
from typing import Callable, Optional

from llm_app import BaseApp, InputFormat, OutputFormat
from pydantic import BaseModel
from tqdm import tqdm


class TestCase(BaseModel):
    test_id: str
    test_input: InputFormat
    reference_output: Optional[OutputFormat] = None
    historical_output: Optional[OutputFormat] = None
    historical_feedback: Optional[str] = None


class EvalProperty(BaseModel):
    property_name: str
    description: str
    eval_func: Callable


class PropertyResult(BaseModel):
    feedback: str
    pass_fail: bool
    property_name: Optional[str] = None


class TestCaseResult(BaseModel):
    test_case_id: str
    output: OutputFormat
    property_results: list[PropertyResult]


class Evaluator:
    def __init__(
        self,
        test_set: list[TestCase],
        properties: list[EvalProperty],
        results_dir: str = "eval_results",
    ):
        self.test_set = test_set
        self.properties = properties
        self.results_dir = results_dir

    def evaluate(
        self,
        llm_app: BaseApp,
        exp_name: Optional[str] = None,
        exp_descr: str = "",
    ):
        # If no experiment name is provided, use the current timestamp
        if not exp_name:
            exp_name = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Create experiment directory
        exp_dir = os.path.join(self.results_dir, exp_name)
        os.makedirs(exp_dir, exist_ok=True)

        # Loop over test cases
        for test_case in tqdm(
            self.test_set, desc="Evaluating test cases", unit="test case", total=len(self.test_set)
        ):
            # Pass the test case to the LLM app
            app_output = llm_app(app_input=test_case.test_input)
            # Evaluate properties
            property_results = []
            for prop in self.properties:
                print(f"Evaluating property {prop.property_name}")
                r = prop.eval_func(test_case=test_case, llm_app_result=app_output)
                # If the result is None, the property is not applicable to this test case, so skip it
                if r:
                    # Store the property results per test case in a list
                    property_results.append(
                        PropertyResult(
                            property_name=prop.property_name,
                            feedback=r.feedback,
                            pass_fail=r.pass_fail,
                        )
                    )
            # Store results as JSON
            tcr = TestCaseResult(
                test_case_id=test_case.test_id, output=app_output, property_results=property_results
            )
            tcr_json = tcr.model_dump_json()
            with open(os.path.join(exp_dir, f"{tcr.test_case_id}.json"), "w") as f:
                f.write(tcr_json)
        # Save the LLM app config dict as JSON
        with open(os.path.join(exp_dir, "llm_app.json"), "w") as f:
            f.write(json.dumps(llm_app.cfg))
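
A non-authoritative sketch of how the pieces in this commit fit together: build a small test set, pick the properties from `eval_properties.py`, and run the `GptBaseApp` from `llm_app.py` through the evaluator (requires `OPENAI_API_KEY`; the test case content is illustrative):

```python
from eval_properties import properties
from evaluator import Evaluator, TestCase
from llm_app import GptBaseApp, InputFormat, OutputFormat

test_set = [
    TestCase(
        test_id="burn-001",
        test_input=InputFormat(question="How do you treat a minor burn?"),
        reference_output=OutputFormat(
            answer="Cool the burn under lukewarm running water for 10 to 20 minutes."
        ),
    ),
]

evaluator = Evaluator(test_set=test_set, properties=properties, results_dir="eval_results")
# Writes one JSON result per test case plus llm_app.json into eval_results/gpt35-baseline/.
evaluator.evaluate(llm_app=GptBaseApp(), exp_name="gpt35-baseline")
```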
52 changes: 52 additions & 0 deletions src/llm_app_eval/llm_app.py
@@ -0,0 +1,52 @@
from typing import Optional

import instructor
import openai
from pydantic import BaseModel

# Patch openai.ChatCompletion.create so that it accepts a `response_model` argument
# and returns a parsed pydantic object.
instructor.patch()


class InputFormat(BaseModel):
    question: str


class OutputFormat(BaseModel):
    answer: str


class BaseApp:
    def __init__(self, config: Optional[dict] = None):
        self.cfg = {}
        if config:
            self.cfg.update(config)

    def __call__(self, app_input: InputFormat) -> OutputFormat:
        return OutputFormat(answer="The answer is always 42.")


class GptBaseApp(BaseApp):
    def __init__(self, config: Optional[dict] = None):
        self.cfg = {
            "gpt_version": "gpt-3.5-turbo-0613",
            "system_prompt": "You are a first-aid expert. Answer the user question. Be accurate and concise.",
        }
        if config:
            self.cfg.update(config)

    def __call__(self, app_input: InputFormat) -> OutputFormat:
        result: OutputFormat = openai.ChatCompletion.create(
            model=self.cfg["gpt_version"],
            response_model=OutputFormat,
            messages=[
                {"role": "system", "content": self.cfg["system_prompt"]},
                {"role": "user", "content": app_input.question},
            ],
        )
        return result


class HumanApp(BaseApp):
    def __call__(self, app_input: InputFormat) -> OutputFormat:
        answer = input(app_input.question)
        return OutputFormat(answer=answer)
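
And a short, non-authoritative usage example for the app classes above (requires `OPENAI_API_KEY` for `GptBaseApp`; the question is illustrative):

```python
from llm_app import GptBaseApp, HumanApp, InputFormat

app = GptBaseApp()  # or HumanApp() to answer the questions yourself in the terminal
output = app(app_input=InputFormat(question="What should you do when someone is choking?"))
print(output.answer)
```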