From b7c1bdc904fa75d4ed6f619082f0eb0a3bfbdb8a Mon Sep 17 00:00:00 2001 From: Emre Demir Date: Sat, 10 Aug 2024 08:26:17 +0200 Subject: [PATCH] Bugfix Summarization Score (#1164) Hi! I was experimenting with Summarization Score and found an issue that when there are no questions generated by the LLM, it throws division by zero error due to 0 generated answers. ## Notable Changes - Added an assertion to keyphrases, questions and answers generation functions. - Changed the llm response variable name `answer` to `response` to prevent possible confusion with score related `answer`. Please let me know if there is something I can add. Cheers! --------- Co-authored-by: jjmachan --- docs/howtos/customisations/run_config.ipynb | 9 ++-- src/ragas/metrics/_summarization.py | 27 ++++++---- src/ragas/metrics/base.py | 9 ++-- src/ragas/testset/prompts.py | 4 +- tests/unit/test_analytics.py | 4 +- tests/unit/test_executor_in_jupyter.ipynb | 6 ++- tests/unit/test_run_config.py | 60 ++++++++++----------- 7 files changed, 68 insertions(+), 51 deletions(-) diff --git a/docs/howtos/customisations/run_config.ipynb b/docs/howtos/customisations/run_config.ipynb index 1db4025cf..bdba0febc 100644 --- a/docs/howtos/customisations/run_config.ipynb +++ b/docs/howtos/customisations/run_config.ipynb @@ -53,15 +53,16 @@ "\n", "# load the dataset\n", "from datasets import load_dataset\n", + "\n", "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v2\")\n", "\n", "# configure RunConfig\n", "from ragas.run_config import RunConfig\n", "\n", "_ = evaluate(\n", - " dataset=amnesty_qa[\"eval\"], \n", + " dataset=amnesty_qa[\"eval\"],\n", " metrics=[faithfulness],\n", - " run_config=RunConfig(max_workers=64), # increasing max_workers from default 16\n", + " run_config=RunConfig(max_workers=64), # increasing max_workers from default 16\n", ")" ] }, @@ -94,9 +95,9 @@ ], "source": [ "_ = evaluate(\n", - " dataset=amnesty_qa[\"eval\"], \n", + " dataset=amnesty_qa[\"eval\"],\n", " metrics=[faithfulness],\n", - " run_config=RunConfig(max_workers=2), # increasing max_workers from default 16\n", + " run_config=RunConfig(max_workers=2), # increasing max_workers from default 16\n", ")" ] }, diff --git a/src/ragas/metrics/_summarization.py b/src/ragas/metrics/_summarization.py index ff2691872..885627680 100644 --- a/src/ragas/metrics/_summarization.py +++ b/src/ragas/metrics/_summarization.py @@ -186,7 +186,7 @@ def _compute_score(self, scores) -> float: """Returns average score of the different scores.""" return sum(scores) / len(scores) - def _compute_qa_score(self, answers: t.List) -> float: + def _compute_qa_score(self, answers: t.List[str]) -> float: """Returns a score between 0 and 1 reflecting the fraction of correct answers, ie with a value 'yes' """ @@ -209,10 +209,15 @@ async def _extract_keyphrases(self, text: str, callbacks: Callbacks) -> t.List[s callbacks=callbacks, ) result_text = result.generations[0][0].text - answer = await _output_parser_keyphrase_extraction.aparse( + response = await _output_parser_keyphrase_extraction.aparse( result_text, p_value, self.llm, self.max_retries ) - return answer.keyphrases if answer else [] + + if not response or not response.keyphrases: + logging.error("No keyphrases generated, unable to calculate the score.") + return [] + + return response.keyphrases async def _get_questions( self, text: str, keyphrases: list[str], callbacks: Callbacks @@ -225,13 +230,15 @@ async def _get_questions( ) result_text = result.generations[0][0].text - answer = await 
_output_parser_question_generation.aparse( + response = await _output_parser_question_generation.aparse( result_text, p_value, self.llm, self.max_retries ) - if answer is None: + + if not response or not response.questions: + logging.error("No questions generated, unable to calculate the score.") return [] - return answer.questions + return response.questions async def _get_answers( self, questions: t.List[str], summary: str, callbacks: Callbacks @@ -244,13 +251,15 @@ async def _get_answers( ) result_text = result.generations[0][0].text - answer = await _output_parser_answer_generation.aparse( + response = await _output_parser_answer_generation.aparse( result_text, p_value, self.llm, self.max_retries ) - if answer is None: + + if not response or not response.answers: + logger.error("No answers generated, unable to calculate the score.") return [] - return answer.answers + return response.answers def adapt(self, language: str, cache_dir: str | None = None) -> None: diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py index 5587a3656..f619ad12c 100644 --- a/src/ragas/metrics/base.py +++ b/src/ragas/metrics/base.py @@ -63,11 +63,13 @@ def get_required_columns( class Metric(ABC): @property @abstractmethod - def name(self) -> str: ... + def name(self) -> str: + ... @property @abstractmethod - def evaluation_mode(self) -> EvaluationMode: ... + def evaluation_mode(self) -> EvaluationMode: + ... @abstractmethod def init(self, run_config: RunConfig): @@ -130,7 +132,8 @@ async def ascore( return score @abstractmethod - async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: ... + async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float: + ... @dataclass diff --git a/src/ragas/testset/prompts.py b/src/ragas/testset/prompts.py index 30f98dd3a..0d2806dda 100644 --- a/src/ragas/testset/prompts.py +++ b/src/ragas/testset/prompts.py @@ -509,5 +509,5 @@ class EvolutionElimination(BaseModel): question_rewrite_prompt, context_scoring_prompt, filter_question_prompt, - evolution_elimination_prompt -] \ No newline at end of file + evolution_elimination_prompt, +] diff --git a/tests/unit/test_analytics.py b/tests/unit/test_analytics.py index bcae8b7e7..0afa9563a 100644 --- a/tests/unit/test_analytics.py +++ b/tests/unit/test_analytics.py @@ -1,5 +1,7 @@ from __future__ import annotations + import typing as t + import pytest @@ -130,7 +132,7 @@ def test_testset_generation_tracking(monkeypatch): def test_was_completed(monkeypatch): - from ragas._analytics import track_was_completed, IsCompleteEvent + from ragas._analytics import IsCompleteEvent, track_was_completed event_properties_list: t.List[IsCompleteEvent] = [] diff --git a/tests/unit/test_executor_in_jupyter.ipynb b/tests/unit/test_executor_in_jupyter.ipynb index 7fc0dca67..73e6ee40c 100644 --- a/tests/unit/test_executor_in_jupyter.ipynb +++ b/tests/unit/test_executor_in_jupyter.ipynb @@ -33,7 +33,7 @@ "\n", "exec = Executor(raise_exceptions=True)\n", "for i in range(10):\n", - " exec.submit(sleep, i/10)\n", + " exec.submit(sleep, i / 10)\n", "\n", "assert exec.results(), \"didn't get anything from results\"" ] @@ -140,6 +140,7 @@ "source": [ "from ragas.metrics.base import Metric, EvaluationMode\n", "\n", + "\n", "class FakeMetric(Metric):\n", " name = \"fake_metric\"\n", " evaluation_mode = EvaluationMode.qa\n", @@ -147,9 +148,10 @@ " def init(self):\n", " pass\n", "\n", - " async def _ascore(self, row, callbacks)->float:\n", + " async def _ascore(self, row, callbacks) -> float:\n", " return 0\n", "\n", + "\n", "fm = 
FakeMetric()" ] }, diff --git a/tests/unit/test_run_config.py b/tests/unit/test_run_config.py index cdb7b49d7..cc08e262c 100644 --- a/tests/unit/test_run_config.py +++ b/tests/unit/test_run_config.py @@ -1,36 +1,42 @@ -import sys, importlib -from packaging.version import parse as parse_version +import importlib +import sys from platform import python_version + import pytest -from numpy.random import default_rng, Generator +from numpy.random import Generator, default_rng +from packaging.version import parse as parse_version from ragas.run_config import RunConfig if parse_version(python_version()) < parse_version("3.10"): - from typing import NewType, Callable - RandomComparison = NewType("RandomComparison", Callable[[Generator, Generator], bool]) + from typing import Callable, NewType + + RandomComparison = NewType( + "RandomComparison", Callable[[Generator, Generator], bool] + ) elif parse_version(python_version()) >= parse_version("3.10"): - from typing import TypeAlias, Callable + from typing import Callable, TypeAlias + RandomComparison: TypeAlias = Callable[[Generator, Generator], bool] + @pytest.fixture(scope="function") def compare_rng() -> Callable[[Generator, Generator], bool]: - """Pytest fixture wrapper to check :py:cls:`numpy.random.Generator` object equivalence. + """Pytest fixture wrapper to check :py:cls:`numpy.random.Generator` object equivalence.""" - """ - def _compare_rng(rng_0:Generator, rng_1:Generator) -> bool: + def _compare_rng(rng_0: Generator, rng_1: Generator) -> bool: """Compare two :py:cls:`numpy.random.Generator`object. - + Args: rng_0 (numpy.random.Generator) : The first generator to compare with. rng_1 (numpy.random.Generator) : The second generator to compare with. Returns: bool: Whether the two generators are at the same state. - + """ return rng_0.random() == rng_1.random() - + return _compare_rng @@ -39,9 +45,11 @@ def _compare_rng(rng_0:Generator, rng_1:Generator) -> bool: ( [42, True], [None, False], - ) + ), ) -def test_random_num_generator(seed, compare_rng:RandomComparison, expected_equivalence): +def test_random_num_generator( + seed, compare_rng: RandomComparison, expected_equivalence +): """Check :py:mod:`numpy.random` functionality and seed behaviour control.""" rc = RunConfig(seed=seed) @@ -53,7 +61,7 @@ def test_random_num_generator(seed, compare_rng:RandomComparison, expected_equiv assert compare_rng(rc.rng, rng) == expected_equivalence # Check generation consistency - importlib.reload(sys.modules['numpy.random']) + importlib.reload(sys.modules["numpy.random"]) new_rc = RunConfig(seed=seed) new_rng = default_rng(seed=seed) @@ -63,22 +71,14 @@ def test_random_num_generator(seed, compare_rng:RandomComparison, expected_equiv # Check equivalence if expected_equivalence: - assert all( - list( - map( - compare_rng, - [rc.rng, new_rc.rng], - [new_rng, rng] - ) - ) - ) + assert all(list(map(compare_rng, [rc.rng, new_rc.rng], [new_rng, rng]))) else: assert all( - list( - map( - lambda x, y:not compare_rng(x, y), - [rc.rng, new_rc.rng], - [new_rng, rng] - ) + list( + map( + lambda x, y: not compare_rng(x, y), + [rc.rng, new_rc.rng], + [new_rng, rng], ) ) + )