diff --git a/docs/getstarted/rag_testset_generation.ipynb b/docs/getstarted/rag_testset_generation.ipynb
index 759dfaa3a..a9d658aed 100644
--- a/docs/getstarted/rag_testset_generation.ipynb
+++ b/docs/getstarted/rag_testset_generation.ipynb
@@ -100,7 +100,8 @@
    "source": [
     "from langchain_openai import ChatOpenAI\n",
     "from ragas.llms.base import LangchainLLMWrapper\n",
-    "openai_model = LangchainLLMWrapper(ChatOpenAI(model_name=\"gpt-4o\"))\n"
+    "\n",
+    "openai_model = LangchainLLMWrapper(ChatOpenAI(model_name=\"gpt-4o\"))"
    ]
   },
   {
diff --git a/src/ragas/experimental/testset/simulators/abstract_qa.py b/src/ragas/experimental/testset/simulators/abstract_qa.py
index 34dddc62f..fed1b459a 100644
--- a/src/ragas/experimental/testset/simulators/abstract_qa.py
+++ b/src/ragas/experimental/testset/simulators/abstract_qa.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import logging
 import math
 import random
@@ -24,6 +26,9 @@
     Themes,
 )
 
+if t.TYPE_CHECKING:
+    from langchain_core.callbacks import Callbacks
+
 logger = logging.getLogger(__name__)
 
 
@@ -41,8 +46,8 @@ def __post_init__(self):
         super().__post_init__()
         self.common_theme_prompt = CommonThemeFromSummaries()
 
-    async def generate_scenarios(
-        self, n: int, knowledge_graph: KnowledgeGraph
+    async def _generate_scenarios(
+        self, n: int, knowledge_graph: KnowledgeGraph, callbacks: Callbacks
     ) -> t.List[AbstractQuestionScenario]:
         node_clusters = knowledge_graph.find_clusters(
             relationship_condition=lambda rel: (
@@ -89,7 +94,7 @@ async def generate_scenarios(
                 summaries=summaries,
                 num_themes=num_themes,
             )
-            kw_list.append({"data": summaries, "llm": self.llm})
+            kw_list.append({"data": summaries, "llm": self.llm, "callbacks": callbacks})
 
         themes: t.List[Themes] = run_async_batch(
             desc="Generating common themes",
@@ -129,14 +134,14 @@ async def generate_scenarios(
         )
         return distributions
 
-    async def generate_sample(
-        self, scenario: AbstractQuestionScenario
+    async def _generate_sample(
+        self, scenario: AbstractQuestionScenario, callbacks: Callbacks
     ) -> SingleTurnSample:
-        user_input = await self.generate_user_input(scenario)
-        if await self.critic_question(user_input):
-            user_input = await self.modify_question(user_input, scenario)
+        user_input = await self.generate_user_input(scenario, callbacks)
+        if await self.critic_question(user_input, callbacks):
+            user_input = await self.modify_question(user_input, scenario, callbacks)
 
-        reference = await self.generate_answer(user_input, scenario)
+        reference = await self.generate_answer(user_input, scenario, callbacks)
 
         reference_contexts = []
         for node in scenario.nodes:
@@ -149,13 +154,16 @@ async def generate_sample(
             reference_contexts=reference_contexts,
         )
 
-    async def generate_user_input(self, scenario: AbstractQuestionScenario) -> str:
+    async def generate_user_input(
+        self, scenario: AbstractQuestionScenario, callbacks: Callbacks
+    ) -> str:
         question = await self.generate_user_input_prompt.generate(
             data=ThemeAndContext(
                 theme=scenario.theme,
                 context=self.make_source_text(scenario),
             ),
             llm=self.llm,
+            callbacks=callbacks,
         )
         return question.text
 
@@ -173,8 +181,8 @@ class ComparativeAbstractQuestionSimulator(QASimulator):
         default_factory=ComparativeAbstractQuestion
     )
 
-    async def generate_scenarios(
-        self, n: int, knowledge_graph: KnowledgeGraph
+    async def _generate_scenarios(
+        self, n: int, knowledge_graph: KnowledgeGraph, callbacks: Callbacks
     ) -> t.List[ComparativeAbstractQuestionScenario]:
         node_clusters = knowledge_graph.find_clusters(
             relationship_condition=lambda rel: (
@@ -220,6 +228,7 @@ async def generate_scenarios(
                     num_concepts=num_concepts,
                 ),
                 "llm": self.llm,
+                "callbacks": callbacks,
             }
         )
 
@@ -264,8 +273,8 @@ async def generate_scenarios(
         )
         return scenarios
 
-    async def generate_sample(
-        self, scenario: ComparativeAbstractQuestionScenario
+    async def _generate_sample(
+        self, scenario: ComparativeAbstractQuestionScenario, callbacks: Callbacks
     ) -> SingleTurnSample:
         # generate the user input
         keyphrases = []
@@ -285,15 +294,18 @@ async def generate_sample(
                 summaries=summaries,
             ),
             llm=self.llm,
+            callbacks=callbacks,
         )
         question = question.text
 
         # critic the question
-        if not await self.critic_question(question):
-            question = await self.modify_question(question, scenario)
+        if not await self.critic_question(question, callbacks):
+            question = await self.modify_question(question, scenario, callbacks)
 
         # generate the answer
-        answer = await self.generate_answer(question, scenario, "summary")
+        answer = await self.generate_answer(
+            question, scenario, callbacks, reference_property_name="summary"
+        )
 
         # make the reference contexts
         # TODO: make this more efficient. Right now we are taking only the summary
diff --git a/src/ragas/experimental/testset/simulators/base.py b/src/ragas/experimental/testset/simulators/base.py
index cb4934ca8..dd35383eb 100644
--- a/src/ragas/experimental/testset/simulators/base.py
+++ b/src/ragas/experimental/testset/simulators/base.py
@@ -7,10 +7,13 @@
 
 from pydantic import BaseModel
 
+from ragas.callbacks import new_group
 from ragas.experimental.testset.graph import KnowledgeGraph, Node
 from ragas.llms import BaseRagasLLM, llm_factory
 
 if t.TYPE_CHECKING:
+    from langchain_core.callbacks import Callbacks
+
     from ragas.dataset_schema import BaseEvalSample
 
 
@@ -45,12 +48,48 @@ def __post_init__(self):
         if not self.name:
             self.name = self.__class__.__name__
 
-    @abstractmethod
     async def generate_scenarios(
-        self, n: int, knowledge_graph: KnowledgeGraph
+        self,
+        n: int,
+        knowledge_graph: KnowledgeGraph,
+        callbacks: t.Optional[Callbacks] = None,
+    ) -> t.List[Scenario]:
+        callbacks = callbacks or []
+        scenario_generation_rm, scenario_generation_group = new_group(
+            name=self.name,
+            inputs={"n": n, "knowledge_graph": str(knowledge_graph)},
+            callbacks=callbacks,
+        )
+        scenarios = await self._generate_scenarios(
+            n, knowledge_graph, scenario_generation_group
+        )
+        scenario_generation_rm.on_chain_end(outputs={"scenarios": scenarios})
+        return scenarios
+
+    @abstractmethod
+    async def _generate_scenarios(
+        self, n: int, knowledge_graph: KnowledgeGraph, callbacks: Callbacks
     ) -> t.List[Scenario]:
         pass
 
+    async def generate_sample(
+        self, scenario: Scenario, callbacks: t.Optional[Callbacks] = None
+    ) -> BaseEvalSample:
+        callbacks = callbacks or []
+
+        # new group for Sample Generation
+        sample_generation_rm, sample_generation_grp = new_group(
+            name=self.name,
+            inputs={"scenario": scenario},
+            callbacks=callbacks,
+        )
+        sample = await self._generate_sample(scenario, sample_generation_grp)
+        sample_generation_rm.on_chain_end(outputs={"sample": sample})
+
+        return sample
+
     @abstractmethod
-    async def generate_sample(self, scenario: Scenario) -> BaseEvalSample:
+    async def _generate_sample(
+        self, scenario: Scenario, callbacks: Callbacks
+    ) -> BaseEvalSample:
         pass
diff --git a/src/ragas/experimental/testset/simulators/base_qa.py b/src/ragas/experimental/testset/simulators/base_qa.py
index f283045f7..4b9e051b4 100644
--- a/src/ragas/experimental/testset/simulators/base_qa.py
+++ b/src/ragas/experimental/testset/simulators/base_qa.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+import typing as t
 from dataclasses import dataclass, field
 
 from ragas.experimental.prompt import StringIO
@@ -13,6 +16,9 @@
     extend_modify_input_prompt,
 )
 
+if t.TYPE_CHECKING:
+    from langchain_core.callbacks import Callbacks
+
 
 @dataclass
 class QASimulator(BaseSimulator[Scenario]):
@@ -22,13 +28,19 @@ class QASimulator(BaseSimulator[Scenario]):
     )
     generate_reference_prompt: PydanticPrompt = field(default_factory=GenerateReference)
 
-    async def critic_question(self, question: str) -> bool:
+    async def critic_question(
+        self, question: str, callbacks: t.Optional[Callbacks] = None
+    ) -> bool:
+        callbacks = callbacks or []
         critic = await self.critic_user_input_prompt.generate(
-            data=StringIO(text=question), llm=self.llm
+            data=StringIO(text=question), llm=self.llm, callbacks=callbacks
         )
         return critic.independence > 1 and critic.clear_intent > 1
 
-    async def modify_question(self, question: str, scenario: Scenario) -> str:
+    async def modify_question(
+        self, question: str, scenario: Scenario, callbacks: t.Optional[Callbacks] = None
+    ) -> str:
+        callbacks = callbacks or []
         prompt = extend_modify_input_prompt(
             question_modification_prompt=self.user_input_modification_prompt,
             style=scenario.style,
@@ -41,6 +53,7 @@ async def modify_question(self, question: str, scenario: Scenario) -> str:
                 length=scenario.length,
             ),
             llm=self.llm,
+            callbacks=callbacks,
         )
         return modified_question.text
 
@@ -48,14 +61,17 @@ async def generate_answer(
         self,
         question: str,
         scenario: Scenario,
+        callbacks: t.Optional[Callbacks] = None,
         reference_property_name: str = "page_content",
     ) -> str:
+        callbacks = callbacks or []
         reference = await self.generate_reference_prompt.generate(
             data=UserInputAndContext(
                 user_input=question,
                 context=self.make_source_text(scenario, reference_property_name),
             ),
             llm=self.llm,
+            callbacks=callbacks,
         )
         return reference.text
 
diff --git a/src/ragas/experimental/testset/simulators/generate.py b/src/ragas/experimental/testset/simulators/generate.py
index 605ff1e34..f81529187 100644
--- a/src/ragas/experimental/testset/simulators/generate.py
+++ b/src/ragas/experimental/testset/simulators/generate.py
@@ -4,6 +4,7 @@
 import typing as t
 from dataclasses import dataclass, field
 
+from ragas.callbacks import new_group
 from ragas.executor import Executor
 from ragas.experimental.testset.graph import KnowledgeGraph, Node, NodeType
 from ragas.experimental.testset.simulators import default_simulator_distribution
@@ -18,6 +19,7 @@
 from ragas.run_config import RunConfig
 
 if t.TYPE_CHECKING:
+    from langchain_core.callbacks import Callbacks
     from langchain_core.documents import Document as LCDocument
     from langchain_core.language_models import BaseLanguageModel as LangchainLLM
 
@@ -25,6 +27,9 @@
     from ragas.experimental.testset.simulators.base import BaseScenario
 
 
+RAGAS_TESTSET_GENERATION_GROUP_NAME = "ragas testset generation"
+
+
 @dataclass
 class TestsetGenerator:
     llm: BaseRagasLLM
@@ -46,6 +51,7 @@ def generate_with_langchain_docs(
         transforms: t.Optional[Transforms] = None,
         simulator_distributions: t.Optional[SimulatorDistributions] = None,
         run_config: t.Optional[RunConfig] = None,
+        callbacks: t.Optional[Callbacks] = None,
         with_debugging_logs=False,
         raise_exceptions: bool = True,
     ) -> Testset:
@@ -73,6 +79,7 @@ def generate_with_langchain_docs(
             test_size=test_size,
             simulator_distributions=simulator_distributions,
             run_config=run_config,
+            callbacks=callbacks,
             with_debugging_logs=with_debugging_logs,
             raise_exceptions=raise_exceptions,
         )
@@ -81,6 +88,7 @@ def generate(
         self,
         test_size: int,
         simulator_distributions: t.Optional[SimulatorDistributions] = None,
+        callbacks: t.Optional[Callbacks] = None,
         run_config: t.Optional[RunConfig] = None,
         with_debugging_logs=False,
         raise_exceptions: bool = True,
@@ -95,6 +103,9 @@ def generate(
         simulator_distribution : Optional[SimulatorDistribution], optional
             A list of tuples containing scenario simulators and their probabilities.
             If None, default simulators will be used.
+        callbacks : Optional[Callbacks], optional
+            Langchain style callbacks to use for the generation process. You can use
+            this to log the generation process or add other metadata.
         run_config : Optional[RunConfig], optional
             Configuration for running the generation process.
         with_debugging_logs : bool, default False
@@ -119,6 +130,14 @@ def generate(
         simulator_distributions = (
             simulator_distributions or default_simulator_distribution(self.llm)
         )
+        callbacks = callbacks or []
+
+        # new group for Testset Generation
+        testset_generation_rm, testset_generation_grp = new_group(
+            name=RAGAS_TESTSET_GENERATION_GROUP_NAME,
+            inputs={"test_size": test_size},
+            callbacks=callbacks,
+        )
 
         if with_debugging_logs:
             # TODO: Edit this before pre-release
@@ -128,6 +147,16 @@ def generate(
             patch_logger("ragas.experimental.testset.graph", logging.DEBUG)
             patch_logger("ragas.experimental.testset.transforms", logging.DEBUG)
 
+        splits, _ = calculate_split_values(
+            [prob for _, prob in simulator_distributions], test_size
+        )
+        # new group for Generation of Scenarios
+        scenario_generation_rm, scenario_generation_grp = new_group(
+            name="Scenario Generation",
+            inputs={"splits": splits},
+            callbacks=testset_generation_grp,
+        )
+
         # generate scenarios
         exec = Executor(
             "Generating Scenarios",
@@ -140,10 +169,24 @@ def generate(
             [prob for _, prob in simulator_distributions], test_size
         )
         for i, (scenario, _) in enumerate(simulator_distributions):
-            exec.submit(scenario.generate_scenarios, splits[i], self.knowledge_graph)
+            exec.submit(
+                scenario.generate_scenarios,
+                splits[i],
+                self.knowledge_graph,
+                scenario_generation_grp,
+            )
 
         scenario_sample_list: t.List[t.List[BaseScenario]] = exec.results()
+        scenario_generation_rm.on_chain_end(
+            outputs={"scenario_sample_list": scenario_sample_list}
+        )
 
+        # new group for Generation of Samples
+        sample_generation_rm, sample_generation_grp = new_group(
+            name="Sample Generation",
+            inputs={"scenario_sample_list": scenario_sample_list},
+            callbacks=testset_generation_grp,
+        )
         exec = Executor(
             "Generating Samples",
             raise_exceptions=raise_exceptions,
@@ -153,7 +196,7 @@ def generate(
         additional_testset_info: t.List[t.Dict] = []
         for i, (simulator, _) in enumerate(simulator_distributions):
             for sample in scenario_sample_list[i]:
-                exec.submit(simulator.generate_sample, sample)
+                exec.submit(simulator.generate_sample, sample, sample_generation_grp)
                 # fill out the additional info for the TestsetSample
                 additional_testset_info.append(
                     {
@@ -162,9 +205,12 @@ def generate(
                 )
 
         eval_samples = exec.results()
+        sample_generation_rm.on_chain_end(outputs={"eval_samples": eval_samples})
 
         # build the testset
         testsets = []
         for sample, additional_info in zip(eval_samples, additional_testset_info):
             testsets.append(TestsetSample(eval_sample=sample, **additional_info))
-        return Testset(samples=testsets)
+        testset = Testset(samples=testsets)
+        testset_generation_rm.on_chain_end({"testset": testset})
+        return testset
diff --git a/src/ragas/experimental/testset/simulators/specific_qa.py b/src/ragas/experimental/testset/simulators/specific_qa.py
index 7aaee6eab..84b2ef43d 100644
--- a/src/ragas/experimental/testset/simulators/specific_qa.py
+++ b/src/ragas/experimental/testset/simulators/specific_qa.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import random
 import typing as t
 from dataclasses import dataclass, field
@@ -10,6 +12,9 @@
 from .base_qa import QASimulator
 from .prompts import SpecificQuestion, SpecificQuestionInput
 
+if t.TYPE_CHECKING:
+    from langchain_core.callbacks import Callbacks
+
 
 class SpecificQuestionScenario(BaseScenario):
     keyphrase: str
@@ -19,8 +24,8 @@ class SpecificQASimulator(QASimulator):
     generate_question_prompt: PydanticPrompt = field(default_factory=SpecificQuestion)
 
-    async def generate_scenarios(
-        self, n: int, knowledge_graph: KnowledgeGraph
+    async def _generate_scenarios(
+        self, n: int, knowledge_graph: KnowledgeGraph, callbacks: Callbacks
     ) -> t.List[SpecificQuestionScenario]:
         # filter out nodes that have keyphrases
         nodes = []
@@ -59,8 +64,8 @@ async def generate_scenarios(
         )
         return scenarios
 
-    async def generate_sample(
-        self, scenario: SpecificQuestionScenario
+    async def _generate_sample(
+        self, scenario: SpecificQuestionScenario, callbacks: Callbacks
     ) -> SingleTurnSample:
         question = await self.generate_question_prompt.generate(
             data=SpecificQuestionInput(
@@ -69,13 +74,16 @@ async def generate_sample(
                 text=scenario.nodes[0].get_property("page_content") or "",
             ),
             llm=self.llm,
+            callbacks=callbacks,
         )
         question_text = question.text
 
-        if not await self.critic_question(question_text):
-            question_text = await self.modify_question(question_text, scenario)
+        if not await self.critic_question(question_text, callbacks):
+            question_text = await self.modify_question(
+                question_text, scenario, callbacks
+            )
 
-        reference = await self.generate_answer(question_text, scenario)
+        reference = await self.generate_answer(question_text, scenario, callbacks)
 
         reference_contexts = []
         for node in scenario.nodes:
diff --git a/src/ragas/testset/evolutions.py b/src/ragas/testset/evolutions.py
index dfec53aa2..9d6c188bd 100644
--- a/src/ragas/testset/evolutions.py
+++ b/src/ragas/testset/evolutions.py
@@ -234,9 +234,7 @@ async def generate_datarow(
         )
         answer = answer if isinstance(answer, dict) else {}
         logger.debug("answer generated: %s", answer)
-        answer = (
-            np.nan if answer.get("verdict") == -1 else answer.get("answer", np.nan)
-        )
+        answer = np.nan if answer.get("verdict") == -1 else answer.get("answer", np.nan)
 
         return DataRow(
             question=question.strip('"'),
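
Usage note (not part of the patch): the new callbacks parameter threads LangChain-style callbacks through TestsetGenerator.generate() and into the nested "Scenario Generation" and "Sample Generation" groups created via new_group(). Below is a minimal sketch of how a caller might hook into those groups; the handler class is plain langchain_core, while the TestsetGenerator import path and constructor wiring shown in the trailing comments are assumptions inferred from the files touched above, not confirmed API.

# sketch.py -- hypothetical usage of the callbacks added in this patch
import typing as t

from langchain_core.callbacks import BaseCallbackHandler


class TraceHandler(BaseCallbackHandler):
    """Prints every chain group the testset generation run opens and closes."""

    def on_chain_start(
        self,
        serialized: t.Dict[str, t.Any],
        inputs: t.Dict[str, t.Any],
        **kwargs: t.Any,
    ) -> None:
        # inputs is the dict passed to new_group(), e.g. {"test_size": ...}
        print("group started with inputs:", list(inputs))

    def on_chain_end(self, outputs: t.Dict[str, t.Any], **kwargs: t.Any) -> None:
        # outputs is the dict passed to on_chain_end(), e.g. {"testset": ...}
        print("group finished with outputs:", list(outputs))


# Assumed wiring (names inferred from this diff, not verified):
# generator = TestsetGenerator(llm=some_ragas_llm, knowledge_graph=kg)
# testset = generator.generate(test_size=10, callbacks=[TraceHandler()])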