feat: add evaluator, properties and llm app
StijnGoossens committed Oct 1, 2023
1 parent 715fc90 commit a75e167
Showing 11 changed files with 1,998 additions and 11 deletions.
8 changes: 6 additions & 2 deletions .gitignore
@@ -23,9 +23,9 @@ data/
.hypothesis/

# Jupyter
*.ipynb
#*.ipynb
.ipynb_checkpoints/
notebooks/
#notebooks/

# macOS
.DS_Store
@@ -62,3 +62,7 @@ __pycache__/

# VS Code
.vscode/

# Project
keys
openai_key
17 changes: 17 additions & 0 deletions README.md
@@ -4,6 +4,23 @@

Evaluates LLM-based applications.

## To-do's
- [x] Convert my EHBO (first-aid) notes into question-answer pairs, using OpenAI Function Calling (see the sketch after this list).
- [ ] Turn the question-answer pairs into a test set (in progress).
- [x] Build Streamlit app for testing myself.
  - [ ] Bug: when I click the 'Evaluate' button, the app jumps to the next question.
- [x] Build LLM component to evaluate the given answers by comparing them with the reference answers.
- [x] Build LLM 'app' that can answer the questions.
- [x] Evaluate the LLM app with the LLM evaluator.
- [ ] Streamlit page to view the evaluation results.
- [ ] Add the question-answer pairs as the knowledge base for that app.
- [ ] Re-evaluate the LLM app with the LLM evaluator.
- [ ] Compare the results.
- [ ] Streamlit page to view, edit and add test cases.
- [ ] Cache the OpenAI function calls.
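
The note-to-question-answer extraction from the first item is handled outside this diff (in `qa_extraction.py`, whose `load_qa_pairs` is imported by `app.py` below). As a rough, non-authoritative sketch of how such an extraction could look with the `instructor`-patched function calling this project already uses — the `QAPair`, `QAPairs` and `extract_qa_pairs` names are hypothetical, not part of the commit:

```python
from typing import List

import instructor
import openai
from pydantic import BaseModel

instructor.patch()  # lets openai.ChatCompletion.create accept a response_model


class QAPair(BaseModel):
    question: str
    answer: str


class QAPairs(BaseModel):
    pairs: List[QAPair]


def extract_qa_pairs(notes: str, model: str = "gpt-3.5-turbo-0613") -> List[QAPair]:
    """Turn free-form first-aid notes into question-answer pairs via function calling."""
    result: QAPairs = openai.ChatCompletion.create(
        model=model,
        response_model=QAPairs,
        messages=[
            {"role": "system", "content": "Extract question-answer pairs from the notes."},
            {"role": "user", "content": notes},
        ],
    )
    return result.pairs
```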



## Using

_Python package_: to add and install this package as a dependency of your project, run `poetry add llm-app-eval`.
1,247 changes: 1,238 additions & 9 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -21,6 +21,9 @@ version_files = ["pyproject.toml:version"]
poethepoet = ">=0.20.0"
python = ">=3.9,<3.9.7 || >3.9.7,<4.0"
streamlit = ">=1.19.0"
instructor = "^0.2.8"
langchain = "^0.0.305"
llama-index = "^0.8.36"

[tool.poetry.group.test.dependencies] # https://python-poetry.org/docs/master/managing-dependencies/
black = ">=23.3.0"
42 changes: 42 additions & 0 deletions src/llm_app_eval/app.py
@@ -2,6 +2,48 @@

from importlib.metadata import version

import numpy as np
import streamlit as st
from evaluator import Evaluator
from qa_extraction import load_qa_pairs

st.title(f"llm-app-eval v{version('llm-app-eval')}") # type: ignore[no-untyped-call]


qa_pairs = load_qa_pairs("src/llm_app_eval/data/question_answer_pairs.csv")
evaluator = Evaluator(llm="gpt-4")

# Shuffle the question and answer pairs
np.random.seed(42)
np.random.shuffle(qa_pairs)
# Display a question and answer pair
if "idx" in st.session_state:
idx = st.session_state.idx
else:
idx = 0
st.session_state.idx = idx
st.write(f"Question {idx + 1} of {len(qa_pairs)}")
qa = qa_pairs[idx]
st.header("Question")
st.write(qa.question)
st.header("Answer")
answer = st.text_input("Answer")
st.header("Reference Answer")
st.write(qa.answer)


eval_button = st.button("Evaluate")
if eval_button:
    result = evaluator.evaluate(qa.question, answer, qa.answer)
    st.write("✅" if result.pass_fail else "❌")
    st.write(result.feedback)
    st.session_state.idx = min(st.session_state.idx + 1, len(qa_pairs) - 1)
else:
    # Display previous and next buttons
    col1, col2, col3 = st.columns(3)
    if col1.button("Previous"):
        st.session_state.idx = max(st.session_state.idx - 1, 0)
    if col2.button("Random"):
        st.session_state.idx = np.random.randint(0, len(qa_pairs))
    if col3.button("Next"):
        st.session_state.idx = min(st.session_state.idx + 1, len(qa_pairs) - 1)
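
Regarding the bug noted in the README ("Evaluate" jumps to the next question): it comes from the `st.session_state.idx` increment inside the `eval_button` branch above, and from the navigation buttons only being rendered in the `else` branch. A minimal, non-authoritative sketch of one way to decouple evaluation from navigation with Streamlit `on_click` callbacks, keeping the `evaluator.evaluate(question, answer, reference)` call signature used in this file (note that this signature differs from the `Evaluator` class added in `evaluator.py` further down in this commit):

```python
# Drop-in replacement for the button section of app.py above (assumes st, np,
# evaluator, qa_pairs, qa and answer are defined as earlier in the file).
def go_previous() -> None:
    st.session_state.idx = max(st.session_state.idx - 1, 0)


def go_next() -> None:
    st.session_state.idx = min(st.session_state.idx + 1, len(qa_pairs) - 1)


def go_random() -> None:
    st.session_state.idx = np.random.randint(0, len(qa_pairs))


col1, col2, col3 = st.columns(3)
col1.button("Previous", on_click=go_previous)
col2.button("Random", on_click=go_random)
col3.button("Next", on_click=go_next)

if st.button("Evaluate"):
    # Evaluate in place; the question only changes when a navigation button is clicked.
    result = evaluator.evaluate(qa.question, answer, qa.answer)
    st.write("✅" if result.pass_fail else "❌")
    st.write(result.feedback)
```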
99 changes: 99 additions & 0 deletions src/llm_app_eval/eval_properties.py
@@ -0,0 +1,99 @@
from typing import Optional

import openai
from evaluator import EvalProperty, OutputFormat, PropertyResult, TestCase

property_llm = "gpt-3.5-turbo-0613"


def evaluate_property_with_llm(
    model: str, system_message: str, user_message: str
) -> PropertyResult:
    # response_model is supported because importing evaluator pulls in llm_app,
    # which patches openai.ChatCompletion.create via instructor.patch().
    return openai.ChatCompletion.create(
        model=model,
        response_model=PropertyResult,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ],
    )


def factually_consistent(test_case: TestCase, llm_app_result: OutputFormat) -> Optional[PropertyResult]:
    if test_case.reference_output and llm_app_result.answer:
        result = evaluate_property_with_llm(
            model=property_llm,
            system_message="Evaluate the answer. The answer should be factually consistent with the reference answer. If not, explain why.",
            user_message=f"Answer: {llm_app_result.answer}\nReference Answer: {test_case.reference_output.answer}",
        )
    else:
        result = None
    return result


def improves_historical_answer(test_case: TestCase, llm_app_result: OutputFormat) -> Optional[PropertyResult]:
    if test_case.test_input and test_case.historical_output and llm_app_result.answer:
        result = evaluate_property_with_llm(
            model=property_llm,
            system_message="Evaluate the new answer. Is the new answer better than the old answer? Explain why.",
            user_message=f"Question: {test_case.test_input.question}\nOld answer: {test_case.historical_output.answer}\nNew answer: {llm_app_result.answer}",
        )
    else:
        result = None
    return result


def takes_feedback_into_account(
    test_case: TestCase, llm_app_result: OutputFormat
) -> Optional[PropertyResult]:
    if (
        test_case.test_input
        and test_case.historical_output
        and llm_app_result.answer
        and test_case.historical_feedback
    ):
        result = evaluate_property_with_llm(
            model=property_llm,
            system_message="Evaluate the new answer. Does the new answer improve upon the old one by taking the feedback into account? Explain why.",
            user_message=f"Question: {test_case.test_input.question}\nOld answer: {test_case.historical_output.answer}\nOld feedback: {test_case.historical_feedback}\nNew answer: {llm_app_result.answer}",
        )
    else:
        result = None
    return result


def length_within_bounds(test_case: TestCase, llm_app_result: OutputFormat) -> Optional[PropertyResult]:
    if test_case.reference_output and llm_app_result.answer:
        if len(llm_app_result.answer) <= 1.2 * len(test_case.reference_output.answer):
            result = PropertyResult(feedback="The answer is not too long.", pass_fail=True)
        else:
            result = PropertyResult(feedback="The answer is too long.", pass_fail=False)
    else:
        result = None
    return result


properties = [
    EvalProperty(
        property_name="FactuallyConsistent",
        description="The answer is factually consistent with the reference answer.",
        eval_func=factually_consistent,
    ),
    # EvalProperty(
    #     property_name="CorrectLanguage",
    #     description="The answer is in the same language as the question.",
    # ),
    EvalProperty(
        property_name="ImprovesHistoricalAnswer",
        description="The answer improves upon the historical answer. It is more complete, more concise, or more accurate.",
        eval_func=improves_historical_answer,
    ),
    EvalProperty(
        property_name="TakesFeedbackIntoAccount",
        description="The answer improves upon the historical answer by taking the feedback into account.",
        eval_func=takes_feedback_into_account,
    ),
    EvalProperty(
        property_name="LengthWithinBounds",
        description="The answer is at most 20% longer than the reference answer.",
        eval_func=length_within_bounds,
    ),
]
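
As a quick, non-authoritative illustration of how a single property is exercised (this one runs locally, without an OpenAI call), using the `TestCase`, `InputFormat` and `OutputFormat` models defined in `evaluator.py` and `llm_app.py` further down in this commit; run it from `src/llm_app_eval/` since the modules use flat imports, and note that the question and answers are made-up example data:

```python
from eval_properties import length_within_bounds
from evaluator import TestCase
from llm_app import InputFormat, OutputFormat

case = TestCase(
    test_id="demo-1",
    test_input=InputFormat(question="How do you treat a minor burn?"),
    reference_output=OutputFormat(answer="Cool the burn under lukewarm running water for 10 to 20 minutes."),
)
app_output = OutputFormat(answer="Hold the burn under cool running water for about fifteen minutes.")

result = length_within_bounds(test_case=case, llm_app_result=app_output)
if result is not None:  # None means the property does not apply to this test case
    print(result.pass_fail, result.feedback)
```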
91 changes: 91 additions & 0 deletions src/llm_app_eval/evaluator.py
@@ -0,0 +1,91 @@
import json
import os
from datetime import datetime
from typing import Callable, Optional

from llm_app import BaseApp, InputFormat, OutputFormat
from pydantic import BaseModel
from tqdm import tqdm


class TestCase(BaseModel):
    test_id: str
    test_input: InputFormat
    reference_output: Optional[OutputFormat] = None
    historical_output: Optional[OutputFormat] = None
    historical_feedback: Optional[str] = None


class EvalProperty(BaseModel):
    property_name: str
    description: str
    eval_func: Callable


class PropertyResult(BaseModel):
    feedback: str
    pass_fail: bool
    property_name: Optional[str] = None


class TestCaseResult(BaseModel):
    test_case_id: str
    output: OutputFormat
    property_results: list[PropertyResult]


class Evaluator:
    def __init__(
        self,
        test_set: list[TestCase],
        properties: list[EvalProperty],
        results_dir: str = "eval_results",
    ):
        self.test_set = test_set
        self.properties = properties
        self.results_dir = results_dir

    def evaluate(
        self,
        llm_app: BaseApp,
        exp_name: Optional[str] = None,
        exp_descr: str = "",
    ):
        # If no experiment name is provided, use the current timestamp
        if not exp_name:
            exp_name = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Create experiment directory
        exp_dir = os.path.join(self.results_dir, exp_name)
        os.makedirs(exp_dir, exist_ok=True)

        # Loop over test cases
        for test_case in tqdm(
            self.test_set, desc="Evaluating test cases", unit="test case", total=len(self.test_set)
        ):
            # Pass the test case to the LLM app
            app_output = llm_app(app_input=test_case.test_input)
            # Evaluate properties
            property_results = []
            for prop in self.properties:
                print(f"Evaluating property {prop.property_name}")
                r = prop.eval_func(test_case=test_case, llm_app_result=app_output)
                # If the result is None, the property is not applicable to this test case, so skip it
                if r:
                    # Store the property results per test case in a list
                    property_results.append(
                        PropertyResult(
                            property_name=prop.property_name,
                            feedback=r.feedback,
                            pass_fail=r.pass_fail,
                        )
                    )
            # Store results as JSON
            tcr = TestCaseResult(
                test_case_id=test_case.test_id, output=app_output, property_results=property_results
            )
            tcr_json = tcr.model_dump_json()
            with open(os.path.join(exp_dir, f"{tcr.test_case_id}.json"), "w") as f:
                f.write(tcr_json)
        # Save the LLM app config dict as JSON
        with open(os.path.join(exp_dir, "llm_app.json"), "w") as f:
            f.write(json.dumps(llm_app.cfg))
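
A non-authoritative sketch of how the pieces in this commit fit together: build a small test set, pick the properties from `eval_properties.py`, and run the `GptBaseApp` from `llm_app.py` through the evaluator (requires `OPENAI_API_KEY`; the test case content is illustrative):

```python
from eval_properties import properties
from evaluator import Evaluator, TestCase
from llm_app import GptBaseApp, InputFormat, OutputFormat

test_set = [
    TestCase(
        test_id="burn-001",
        test_input=InputFormat(question="How do you treat a minor burn?"),
        reference_output=OutputFormat(
            answer="Cool the burn under lukewarm running water for 10 to 20 minutes."
        ),
    ),
]

evaluator = Evaluator(test_set=test_set, properties=properties, results_dir="eval_results")
# Writes one JSON result per test case plus llm_app.json into eval_results/gpt35-baseline/.
evaluator.evaluate(llm_app=GptBaseApp(), exp_name="gpt35-baseline")
```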
52 changes: 52 additions & 0 deletions src/llm_app_eval/llm_app.py
@@ -0,0 +1,52 @@
from typing import Optional

import instructor
import openai
from pydantic import BaseModel

# Patch openai.ChatCompletion.create so that it accepts a `response_model` argument
# and returns a parsed pydantic object.
instructor.patch()


class InputFormat(BaseModel):
    question: str


class OutputFormat(BaseModel):
    answer: str


class BaseApp:
    def __init__(self, config: Optional[dict] = None):
        self.cfg = {}
        if config:
            self.cfg.update(config)

    def __call__(self, app_input: InputFormat) -> OutputFormat:
        return OutputFormat(answer="The answer is always 42.")


class GptBaseApp(BaseApp):
    def __init__(self, config: Optional[dict] = None):
        self.cfg = {
            "gpt_version": "gpt-3.5-turbo-0613",
            "system_prompt": "You are a first-aid expert. Answer the user question. Be accurate and concise.",
        }
        if config:
            self.cfg.update(config)

    def __call__(self, app_input: InputFormat) -> OutputFormat:
        result: OutputFormat = openai.ChatCompletion.create(
            model=self.cfg["gpt_version"],
            response_model=OutputFormat,
            messages=[
                {"role": "system", "content": self.cfg["system_prompt"]},
                {"role": "user", "content": app_input.question},
            ],
        )
        return result


class HumanApp(BaseApp):
    def __call__(self, app_input: InputFormat) -> OutputFormat:
        answer = input(app_input.question)
        return OutputFormat(answer=answer)
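
And a short, non-authoritative usage example for the app classes above (requires `OPENAI_API_KEY` for `GptBaseApp`; the question is illustrative):

```python
from llm_app import GptBaseApp, HumanApp, InputFormat

app = GptBaseApp()  # or HumanApp() to answer the questions yourself in the terminal
output = app(app_input=InputFormat(question="What should you do when someone is choking?"))
print(output.answer)
```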