diff --git a/.gitignore b/.gitignore
index 28e67c682..882f9ff57 100644
--- a/.gitignore
+++ b/.gitignore
@@ -168,4 +168,4 @@ cython_debug/
 experiments/
 **/fil-result/
 src/ragas/_version.py
-.vscode/settings.json
+.vscode
diff --git a/src/experimental/ragas_experimental/llms/prompt.py b/src/experimental/ragas_experimental/llms/prompt.py
new file mode 100644
index 000000000..8b0c7f978
--- /dev/null
+++ b/src/experimental/ragas_experimental/llms/prompt.py
@@ -0,0 +1,141 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+import json
+import typing as t
+
+from ragas.llms.output_parser import RagasoutputParser
+from ragas.llms.prompt import PromptValue
+
+# Check Pydantic version
+from pydantic import BaseModel
+import pydantic
+
+if t.TYPE_CHECKING:
+    from ragas.llms.base import BaseRagasLLM
+    from langchain_core.callbacks import Callbacks
+
+PYDANTIC_V2 = pydantic.VERSION.startswith("2.")
+
+
+class BasePrompt(ABC):
+    def __init__(self, llm):
+        self.llm: BaseRagasLLM = llm
+
+    @abstractmethod
+    async def generate(self, data: t.Any) -> t.Any:
+        pass
+
+
+def model_to_dict(
+    model: BaseModel,
+    by_alias: bool = False,
+    exclude_unset: bool = False,
+    exclude_defaults: bool = False,
+) -> t.Dict[str, t.Any]:
+    if PYDANTIC_V2:
+        return model.model_dump(  # type: ignore
+            by_alias=by_alias,
+            exclude_unset=exclude_unset,
+            exclude_defaults=exclude_defaults,
+        )
+    else:
+        return model.dict(
+            by_alias=by_alias,
+            exclude_unset=exclude_unset,
+            exclude_defaults=exclude_defaults,
+        )
+
+
+def to_json(model: t.Any, indent: int = 4) -> str:
+    if PYDANTIC_V2:
+        # Pydantic 2.x
+        return model.model_dump_json(indent=indent)
+    else:
+        # Pydantic 1.x
+        return model.json(indent=indent)
+
+
+def model_to_json_schema(model: t.Type[BaseModel]) -> dict:
+    if PYDANTIC_V2:
+        return model.model_json_schema()
+    else:
+        return model.schema()
+
+InputModel = t.TypeVar("InputModel", bound=BaseModel)
+OutputModel = t.TypeVar("OutputModel", bound=BaseModel)
+
+
+class StringIO(BaseModel):
+    text: str
+
+
+class PydanticPrompt(BasePrompt, t.Generic[InputModel, OutputModel]):
+    input_model: t.Type[InputModel]
+    output_model: t.Type[OutputModel]
+    instruction: str
+    examples: t.List[t.Tuple[InputModel, OutputModel]] = []
+
+    def generate_instruction(self) -> str:
+        return self.instruction
+
+    def generate_output_signature(self, indent: int = 4) -> str:
+        schema = model_to_json_schema(self.output_model)
+        return (
+            f"Please return the output in a JSON format that complies with the "
+            f"following schema as specified in JSON Schema and OpenAPI specification:\n"
+            f"{schema}"
+        )
+
+    def generate_examples(self):
+        if self.examples:
+            example_strings = []
+            for e in self.examples:
+                input_data, output_data = e
+                example_strings.append(
+                    self.instruction
+                    + "\n"
+                    + "input: " + to_json(input_data, indent=4)
+                    + "\n"
+                    + "output: " + to_json(output_data, indent=4)
+                )
+
+            return (
+                "These are some examples to show how to perform the above instruction\n"
+                + "\n\n".join(example_strings)
+            )
+        # if no examples are provided
+        else:
+            return ""
+
+    def to_string(self, data: InputModel) -> str:
+        # this needs a check
+        return (
+            self.generate_instruction()
+            + "\n"
+            + self.generate_output_signature()
+            + "\n"
+            + self.generate_examples()
+            + "\nNow perform the above instruction with the following input\n"
+            + "input: " + to_json(data, indent=4)
+            + "\n"
+            + "output: "
+        )
+
+    async def generate(self, data: InputModel, callbacks: Callbacks) -> OutputModel:
+        prompt_value = PromptValue(prompt_str=self.to_string(data))
+        resp = await self.llm.generate(prompt_value, callbacks=callbacks)
+        resp_text = resp.generations[0][0].text
+        parser = RagasoutputParser(pydantic_object=self.output_model)
+        answer = await parser.aparse(resp_text, prompt_value, self.llm, max_retries=3)
+
+        # TODO: make sure RagasoutputParser returns the same type as OutputModel
+        return answer  # type: ignore
+
+
+class StringPrompt(BasePrompt):
+    async def generate(self, data: str) -> str:
+        prompt_value = PromptValue(prompt_str=data)
+        llm_result = await self.llm.agenerate_text(prompt_value)
+        return llm_result.generations[0][0].text
\ No newline at end of file
diff --git a/src/experimental/ragas_experimental/metrics/__init__.py b/src/experimental/ragas_experimental/metrics/__init__.py
new file mode 100644
index 000000000..e41ccd002
--- /dev/null
+++ b/src/experimental/ragas_experimental/metrics/__init__.py
@@ -0,0 +1,3 @@
+from ._faithfulness import FaithfulnessExperimental
+
+__all__ = ["FaithfulnessExperimental"]
\ No newline at end of file
diff --git a/src/experimental/ragas_experimental/metrics/_faithfulness.py b/src/experimental/ragas_experimental/metrics/_faithfulness.py
new file mode 100644
index 000000000..81e50c841
--- /dev/null
+++ b/src/experimental/ragas_experimental/metrics/_faithfulness.py
@@ -0,0 +1,226 @@
+from __future__ import annotations
+
+import typing as t
+import logging
+from dataclasses import dataclass
+
+from pydantic import BaseModel, Field
+import numpy as np
+
+from ragas.metrics.base import EvaluationMode, MetricWithLLM, get_segmenter
+from ragas_experimental.llms.prompt import PydanticPrompt
+
+if t.TYPE_CHECKING:
+    from langchain_core.callbacks import Callbacks
+    from ragas.metrics._faithfulness import HasSegmentMethod
+
+
+logger = logging.getLogger(__name__)
+
+
+class FaithfulnessStatements(BaseModel):
+    question: str = Field(description="The question to answer")
+    answer: str = Field(description="The answer to the question")
+    sentences: t.Dict[int, str] = Field(
+        description="A mapping of sentence index to the sentence"
+    )
+
+
+class SentenceComponents(BaseModel):
+    sentence_index: int = Field(description="The index of the sentence")
+    simpler_statements: t.List[str] = Field(
+        description="A list of simpler statements that can be directly inferred from the context"
+    )
+
+
+class SentencesSimplified(BaseModel):
+    sentences: t.List[SentenceComponents] = Field(
+        description="A list of sentences and their simpler versions"
+    )
+
+
+# examples
+example_input_1 = FaithfulnessStatements(
+    question="Who was Albert Einstein and what is he best known for?",
+    answer="He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.",
+    sentences={
+        0: "He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time.",
+        1: "He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.",
+    },
+)
+
+example_output_1 = SentencesSimplified(
+    sentences=[
+        SentenceComponents(
+            sentence_index=0,
+            simpler_statements=[
+                "Albert Einstein was a German-born theoretical physicist.",
+                "Albert Einstein is recognized as one of the greatest and most influential physicists of all time.",
+            ],
+        ),
+        SentenceComponents(
+            sentence_index=1,
+            simpler_statements=[
+                "Albert Einstein was best known for developing the theory of relativity.",
+                "Albert Einstein also made important contributions to the development of the theory of quantum mechanics.",
+            ],
+        ),
+    ]
+)
+
+
+class LongFormAnswerPrompt(PydanticPrompt[FaithfulnessStatements, SentencesSimplified]):
+    instruction = "Given a question, an answer, and sentences from the answer, analyze the complexity of each sentence given under 'sentences' and break down each sentence into one or more fully understandable statements while also ensuring no pronouns are used in each statement. Format the outputs in JSON."
+    input_model = FaithfulnessStatements
+    output_model = SentencesSimplified
+    examples = [(example_input_1, example_output_1)]
+
+
+class StatementFaithfulnessAnswer(BaseModel):
+    statement: str = Field(..., description="the original statement, word-by-word")
+    reason: str = Field(..., description="the reason for the verdict")
+    verdict: int = Field(..., description="the verdict (0/1) of the faithfulness.")
+
+
+class NLIStatementOutput(BaseModel):
+    statements: t.List[StatementFaithfulnessAnswer]
+
+
+class NLIStatementInput(BaseModel):
+    context: str = Field(..., description="The context of the question")
+    statements: t.List[str] = Field(..., description="The statements to judge")
+
+
+class NLIStatementPrompt(PydanticPrompt[NLIStatementInput, NLIStatementOutput]):
+    instruction = "Your task is to judge the faithfulness of a series of statements based on a given context. For each statement you must return verdict as 1 if the statement can be directly inferred based on the context or 0 if the statement can not be directly inferred based on the context."
+    input_model = NLIStatementInput
+    output_model = NLIStatementOutput
+    examples = [
+        (
+            NLIStatementInput(
+                context="""John is a student at XYZ University. He is pursuing a degree in Computer Science. He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. John is a diligent student and spends a significant amount of time studying and completing assignments. He often stays late in the library to work on his projects.""",
+                statements=[
+                    "John is majoring in Biology.",
+                    "John is taking a course on Artificial Intelligence.",
+                    "John is a dedicated student.",
+                    "John has a part-time job.",
+                ],
+            ),
+            NLIStatementOutput(
+                statements=[
+                    StatementFaithfulnessAnswer(
+                        statement="John is majoring in Biology.",
+                        reason="John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.",
+                        verdict=0,
+                    ),
+                    StatementFaithfulnessAnswer(
+                        statement="John is taking a course on Artificial Intelligence.",
+                        reason="The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.",
+                        verdict=0,
+                    ),
+                    StatementFaithfulnessAnswer(
+                        statement="John is a dedicated student.",
+                        reason="The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.",
+                        verdict=1,
+                    ),
+                    StatementFaithfulnessAnswer(
+                        statement="John has a part-time job.",
+                        reason="There is no information given in the context about John having a part-time job.",
+                        verdict=0,
+                    ),
+                ]
+            ),
+        ),
+        (
+            NLIStatementInput(
+                context="Photosynthesis is a process used by plants, algae, and certain bacteria to convert light energy into chemical energy.",
+                statements=[
+                    "Albert Einstein was a genius.",
+                ],
+            ),
+            NLIStatementOutput(
+                statements=[
+                    StatementFaithfulnessAnswer(
+                        statement="Albert Einstein was a genius.",
+                        reason="The context and statement are unrelated.",
+                        verdict=0,
+                    )
+                ]
+            ),
+        ),
+    ]
+
+
+@dataclass
+class FaithfulnessExperimental(MetricWithLLM):
+    name: str = "faithfulness_experimental"  # type: ignore
+    evaluation_mode: EvaluationMode = EvaluationMode.qac  # type: ignore
+    sentence_segmenter: t.Optional[HasSegmentMethod] = None
+    max_retries: int = 1
+    _reproducibility: int = 1
+
+    @property
+    def reproducibility(self):
+        return self._reproducibility
+
+    @reproducibility.setter
+    def reproducibility(self, value):
+        if value < 1:
+            logger.warning("reproducibility cannot be less than 1, setting to 1")
+            value = 1
+        elif value % 2 == 0:
+            logger.warning(
+                "reproducibility level cannot be set to an even number, setting to odd"
+            )
+            value += 1
+        self._reproducibility = value
+
+    def __post_init__(self):
+        self.long_form_answer_prompt = LongFormAnswerPrompt(llm=self.llm)
+        self.nli_statement_prompt = NLIStatementPrompt(llm=self.llm)
+        if self.sentence_segmenter is None:
+            # TODO: make this dynamic, taking language from prompt
+            language = "english"
+            self.sentence_segmenter = get_segmenter(language=language, clean=False)
+
+    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
+        answer, question, contexts = row["answer"], row["question"], row["contexts"]
+
+        # get the sentences from the answer
+        sentences = self.sentence_segmenter.segment(answer)
+        # TODO: why do we do this?
+        sentences = [
+            sentence for sentence in sentences if sentence.strip().endswith(".")
+        ]
+        sentence_components = await self.long_form_answer_prompt.generate(
+            FaithfulnessStatements(
+                question=question,
+                answer=answer,
+                sentences={i: sentence for i, sentence in enumerate(sentences)},
+            ),
+            callbacks=callbacks
+        )
+
+        statements = [
+            statement
+            for component in sentence_components.sentences
+            for statement in component.simpler_statements
+        ]
+        verdicts = await self.nli_statement_prompt.generate(
+            NLIStatementInput(
+                context="\n".join(contexts),
+                statements=statements,
+            ),
+            callbacks=callbacks
+        )
+
+        # compute the score
+        num_faithful_statements = sum(
+            verdict.verdict for verdict in verdicts.statements
+        )
+        if len(statements):
+            score = num_faithful_statements / len(statements)
+        else:
+            score = np.nan
+        return score
+
diff --git a/src/experimental/tests/test_prompt.py b/src/experimental/tests/test_prompt.py
new file mode 100644
index 000000000..f2e406369
--- /dev/null
+++ b/src/experimental/tests/test_prompt.py
@@ -0,0 +1,100 @@
+from ragas_experimental.llms.prompt import StringPrompt, StringIO
+from ragas.llms.base import BaseRagasLLM
+from langchain_core.outputs import LLMResult, Generation
+from ragas.llms.prompt import PromptValue
+from ragas.run_config import RunConfig
+
+import pytest
+
+
+class EchoLLM(BaseRagasLLM):
+    def generate_text(  # type: ignore
+        self,
+        prompt: PromptValue,
+    ) -> LLMResult:
+        return LLMResult(generations=[[Generation(text=prompt.to_string())]])
+
+    async def agenerate_text(  # type: ignore
+        self,
+        prompt: PromptValue,
+    ) -> LLMResult:
+        return LLMResult(generations=[[Generation(text=prompt.to_string())]])
+
+
+@pytest.mark.asyncio
+async def test_string_prompt():
+    echo_llm = EchoLLM(run_config=RunConfig())
+    prompt = StringPrompt(llm=echo_llm)
+    assert await prompt.generate("hello") == "hello"
+
+
+expected_generate_output_signature = """\
+Please return the output in the following JSON format based on the StringIO model:
+{
+    "text": "str"
+}\
+"""
+
+
+def test_process_fields():
+    from ragas_experimental.llms.prompt import PydanticPrompt, StringIO
+    from pydantic import BaseModel
+    from enum import Enum
+
+    class Categories(str, Enum):
+        science = "science"
+        commerce = "commerce"
+        agriculture = "agriculture"
+        economics = "economics"
+
+    class InputModel(BaseModel):
+        category: Categories
+
+    class JokeGenerator(PydanticPrompt[InputModel, StringIO]):
+        instruction = "Generate a joke in the category of {category}."
+        output_model = StringIO
+
+    echo_llm = EchoLLM(run_config=RunConfig())
+    p = JokeGenerator(llm=echo_llm)
+    _ = p.generate_output_signature()
+
+    # assert expected_generate_output_signature == generation
+
+
+@pytest.mark.asyncio
+async def test_pydantic_prompt_io():
+    from ragas_experimental.llms.prompt import (
+        PydanticPrompt,
+        StringIO,
+    )
+
+    class Prompt(PydanticPrompt[StringIO, StringIO]):
+        instruction = ""
+        input_model = StringIO
+        output_model = StringIO
+
+    llm = EchoLLM(run_config=RunConfig())
+    p = Prompt(llm=llm)
+    assert p.input_model == StringIO
+    assert p.output_model == StringIO
+
+    assert p.generate_examples() == ""
+
+
+def test_pydantic_prompt_examples():
+    from ragas_experimental.llms.prompt import (
+        PydanticPrompt,
+    )
+
+    class Prompt(PydanticPrompt[StringIO, StringIO]):
+        instruction = ""
+        input_model = StringIO
+        output_model = StringIO
+        examples = [
+            (StringIO(text="hello"), StringIO(text="hello")),
+            (StringIO(text="world"), StringIO(text="world")),
+        ]
+
+    llm = EchoLLM(run_config=RunConfig())
+    _ = Prompt(llm=llm)
+    # assert p.generate_examples() == "hello -> hello\nworld -> world"
diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py
index 93356228e..f0b2b3daa 100644
--- a/src/ragas/llms/base.py
+++ b/src/ragas/llms/base.py
@@ -4,7 +4,7 @@
 import logging
 import typing as t
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from functools import partial
 
 from langchain_community.chat_models.vertexai import ChatVertexAI
@@ -46,7 +46,7 @@ def is_multiple_completion_supported(llm: BaseLanguageModel) -> bool:
 
 @dataclass
 class BaseRagasLLM(ABC):
-    run_config: RunConfig
+    run_config: RunConfig = field(default_factory=RunConfig)
 
     def set_run_config(self, run_config: RunConfig):
         self.run_config = run_config
@@ -63,8 +63,7 @@ def generate_text(
         temperature: float = 1e-8,
         stop: t.Optional[t.List[str]] = None,
         callbacks: Callbacks = None,
-    ) -> LLMResult:
-        ...
+    ) -> LLMResult: ...
 
     @abstractmethod
     async def agenerate_text(
@@ -74,8 +73,7 @@ async def agenerate_text(
         temperature: t.Optional[float] = None,
         stop: t.Optional[t.List[str]] = None,
         callbacks: Callbacks = None,
-    ) -> LLMResult:
-        ...
+    ) -> LLMResult: ...
 
     async def generate(
         self,
diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py
index b52733429..1ca4b68a3 100644
--- a/src/ragas/metrics/_faithfulness.py
+++ b/src/ragas/metrics/_faithfulness.py
@@ -209,14 +209,14 @@ def _create_nli_prompt(self, row: t.Dict, statements: t.List[str]) -> PromptValu
 
     def _create_statements_prompt(self, row: t.Dict) -> PromptValue:
         assert self.sentence_segmenter is not None, "sentence_segmenter is not set"
-        text, question = row["answer"], row["question"]
-        sentences = self.sentence_segmenter.segment(text)
+        answer, question = row["answer"], row["question"]
+        sentences = self.sentence_segmenter.segment(answer)
         sentences = [
            sentence for sentence in sentences if sentence.strip().endswith(".")
         ]
         sentences = "\n".join([f"{i}:{x}" for i, x in enumerate(sentences)])
         prompt_value = self.statement_prompt.format(
-            question=question, answer=text, sentences=sentences
+            question=question, answer=answer, sentences=sentences
         )
         return prompt_value
 
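
Usage sketch (illustrative only, not part of the diff): the SummaryInput, SummaryOutput, and SummaryPrompt names, the instruction text, and the example pair below are assumptions used to show how the PydanticPrompt base class added in src/experimental/ragas_experimental/llms/prompt.py can be subclassed. Rendering the prompt with to_string() needs no LLM call; generate() would additionally require a configured BaseRagasLLM and callbacks.

from pydantic import BaseModel

from ragas_experimental.llms.prompt import PydanticPrompt


class SummaryInput(BaseModel):
    text: str


class SummaryOutput(BaseModel):
    summary: str


class SummaryPrompt(PydanticPrompt[SummaryInput, SummaryOutput]):
    # instruction and example are illustrative assumptions, not shipped prompts
    instruction = "Summarize the given text in one sentence."
    input_model = SummaryInput
    output_model = SummaryOutput
    examples = [
        (
            SummaryInput(text="Ragas evaluates RAG pipelines with LLM-based metrics."),
            SummaryOutput(summary="Ragas is a toolkit for evaluating RAG pipelines."),
        )
    ]


# to_string() only assembles instruction, output schema, examples, and input,
# so llm=None is enough to inspect the rendered prompt here.
prompt = SummaryPrompt(llm=None)
print(prompt.to_string(SummaryInput(text="Faithfulness is judged per statement.")))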