From e23220bd327307971342901d1a69e353932562de Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Tue, 16 Jan 2024 16:22:48 -0800 Subject: [PATCH] Remove AdapterSpec from metrics --- src/helm/benchmark/metrics/basic_metrics.py | 74 +++++++------------ .../benchmark/metrics/cleva_harms_metrics.py | 2 - src/helm/benchmark/metrics/code_metrics.py | 7 +- .../benchmark/metrics/copyright_metrics.py | 2 - .../metrics/disinformation_metrics.py | 16 ++-- src/helm/benchmark/metrics/dry_run_metrics.py | 9 +-- .../benchmark/metrics/efficiency_metrics.py | 7 +- .../metrics/evaluate_instances_metric.py | 19 ++--- .../metrics/evaluate_reference_metrics.py | 11 +-- .../image_generation/aesthetics_metrics.py | 2 - .../image_generation/clip_score_metrics.py | 2 - .../denoised_runtime_metric.py | 8 +- .../image_generation/detection_metrics.py | 2 - .../image_generation/efficiency_metrics.py | 2 - .../image_generation/fidelity_metrics.py | 8 +- .../fractal_dimension_metric.py | 2 - .../image_generation/gender_metrics.py | 2 - .../image_critique_metrics.py | 13 +--- .../metrics/image_generation/lpips_metrics.py | 2 - .../multi_scale_ssim_metrics.py | 2 - .../metrics/image_generation/nsfw_metrics.py | 2 - .../image_generation/nudity_metrics.py | 2 - .../photorealism_critique_metrics.py | 11 +-- .../metrics/image_generation/psnr_metrics.py | 2 - .../image_generation/q16_toxicity_metrics.py | 2 - .../image_generation/skin_tone_metrics.py | 2 - .../metrics/image_generation/uiqi_metrics.py | 7 +- .../image_generation/watermark_metrics.py | 2 - .../instruction_following_critique_metrics.py | 2 - .../metrics/language_modeling_metrics.py | 15 ++-- src/helm/benchmark/metrics/metric.py | 73 +++++++++--------- .../benchmark/metrics/numeracy_metrics.py | 2 - src/helm/benchmark/metrics/ranking_metrics.py | 2 - .../metrics/summarization_critique_metrics.py | 2 - .../metrics/summarization_metrics.py | 11 +-- .../benchmark/metrics/toxicity_metrics.py | 2 - src/helm/benchmark/runner.py | 2 +- 37 files changed, 111 insertions(+), 222 deletions(-) diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py index f4a17dc065..47d96c438e 100644 --- a/src/helm/benchmark/metrics/basic_metrics.py +++ b/src/helm/benchmark/metrics/basic_metrics.py @@ -7,19 +7,13 @@ import numpy as np import scipy import calibration as cal -from helm.benchmark.adaptation.scenario_state import ScenarioState from helm.benchmark.metrics.evaluate_reference_metrics import compute_reference_metrics from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric from helm.common.hierarchical_logger import hlog from helm.common.request import Token, Sequence -from helm.benchmark.adaptation.adapters.adapter_factory import ( - ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, - ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED, - ADAPT_RANKING_BINARY, -) from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec +from helm.benchmark.metrics.metric import group_request_states_by_train_trial from helm.benchmark.window_services.window_service import WindowService from helm.benchmark.window_services.window_service_factory import WindowServiceFactory from helm.benchmark.window_services.tokenizer_service import TokenizerService @@ -107,20 +101,18 @@ class InstancesPerSplitMetric(MetricInterface): """Report the average num_instances in each MetricContext across train_trials.""" def evaluate( - self, scenario_state: ScenarioState, metric_service: MetricService, 
eval_cache_path: str, parallelism: int + self, request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, parallelism: int ) -> MetricResult: - adapter_spec = scenario_state.adapter_spec global_stats: Dict[MetricName, Stat] = {} - for train_trial_index in range(adapter_spec.num_train_trials): + for trial_request_states in group_request_states_by_train_trial(request_states): trial_stats: Dict[MetricName, Stat] = {} # Statistics just for this trial # Group instances in this train_trial by context. instances_per_metric_context: Dict[MetricContext, Set[Instance]] = defaultdict(set) - for request_state in scenario_state.request_states: - if request_state.train_trial_index == train_trial_index: - instances_per_metric_context[MetricContext.from_instance(request_state.instance)].add( - request_state.instance - ) + for request_state in trial_request_states: + instances_per_metric_context[MetricContext.from_instance(request_state.instance)].add( + request_state.instance + ) for context, instance_set in instances_per_metric_context.items(): stat = Stat(MetricName("num_instances")).add(len(instance_set)) merge_stat(trial_stats, add_context(stat, context)) @@ -151,25 +143,23 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, ) -> List[Stat]: """Compute all metrics.""" stats: List[Stat] = [] - stats.extend(compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service)) + stats.extend(compute_request_state_metrics(self.efficiency_metric, request_state, metric_service)) if len(request_state.instance.references) > 0: - stats.extend(compute_reference_metrics(self.names, adapter_spec, request_state, metric_service)) + stats.extend(compute_reference_metrics(self.names, request_state, metric_service)) - stats.extend(compute_language_modeling_metrics(adapter_spec, request_state, metric_service)) + stats.extend(compute_language_modeling_metrics(request_state, metric_service)) return stats def evaluate_references( self, - adapter_spec: AdapterSpec, reference_request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, @@ -218,37 +208,34 @@ def compute_logprob_and_length(request_state: RequestState, window_service: Wind num_choices = len(references) tokenizer_service: TokenizerService = metric_service - window_service: WindowService = WindowServiceFactory.get_window_service( - adapter_spec.model_deployment, tokenizer_service - ) + model_deployment: str = reference_request_states[0].request.model_deployment + window_service: WindowService = WindowServiceFactory.get_window_service(model_deployment, tokenizer_service) reference_stats: Dict[ReferenceKey, ReferenceStat] = {} for request_state in reference_request_states: assert request_state.reference_index is not None and request_state.request_mode is not None reference_key = ReferenceKey(request_state.reference_index, request_state.request_mode) reference_stats[reference_key] = compute_logprob_and_length(request_state, window_service) - if adapter_spec.method in [ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_RANKING_BINARY]: + is_calibrated = any([request_state.request_mode == "calibration" for request_state in reference_request_states]) + + if is_calibrated: reference_scores = [ reference_stats[ReferenceKey(i, "original")].logprob - / reference_stats[ReferenceKey(i, "original")].num_tokens + - reference_stats[ReferenceKey(i, "calibration")].logprob for i in 
range(num_choices) ] - elif adapter_spec.method == ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: + else: reference_scores = [ reference_stats[ReferenceKey(i, "original")].logprob - - reference_stats[ReferenceKey(i, "calibration")].logprob + / reference_stats[ReferenceKey(i, "original")].num_tokens for i in range(num_choices) ] - else: - raise ValueError(f"Unknown adapter method: {adapter_spec.method}") stats: List[Stat] = [] general_metrics: Dict[MetricName, Stat] = {} for request_state in reference_request_states: - for stat in compute_request_state_metrics( - self.efficiency_metric, adapter_spec, request_state, metric_service - ): + for stat in compute_request_state_metrics(self.efficiency_metric, request_state, metric_service): merge_stat(general_metrics, stat) stats.extend(general_metrics.values()) max_prob = np.max(scipy.special.softmax(reference_scores)) @@ -284,7 +271,6 @@ def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat def compute_request_state_metrics( efficiency_metric: EfficiencyMetric, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, ) -> List[Stat]: @@ -294,20 +280,14 @@ def compute_request_state_metrics( stats: List[Stat] = [] stats.append(Stat(MetricName("num_references")).add(len(request_state.instance.references))) - - # Copy from adapter spec - stats.append(Stat(MetricName("num_train_trials")).add(adapter_spec.num_train_trials)) - - stats.extend(efficiency_metric.compute_efficiency_metrics(adapter_spec, request_state, metric_service)) - stats.extend(_compute_finish_reason_metrics(adapter_spec, request_state, metric_service)) - stats.extend(_compute_truncation_metrics(adapter_spec, request_state, metric_service)) + stats.extend(efficiency_metric.compute_efficiency_metrics(request_state, metric_service)) + stats.extend(_compute_finish_reason_metrics(request_state, metric_service)) + stats.extend(_compute_truncation_metrics(request_state, metric_service)) return stats -def _compute_finish_reason_metrics( - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService -) -> List[Stat]: +def _compute_finish_reason_metrics(request_state: RequestState, metric_service: MetricService) -> List[Stat]: """Record how often generation finished due to reaching token limit, stop token(s), or end of text""" assert request_state.result is not None sequence = request_state.result.completions[0] @@ -327,9 +307,7 @@ def _compute_finish_reason_metrics( ] -def _compute_truncation_metrics( - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService -) -> List[Stat]: +def _compute_truncation_metrics(request_state: RequestState, metric_service: MetricService) -> List[Stat]: """ Record the number of training instances used in the prompt and whether even the prompt needed to be truncated (once we hit zero training instances). 
@@ -340,9 +318,7 @@ def _compute_truncation_metrics( ] -def compute_language_modeling_metrics( - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService -) -> List[Stat]: +def compute_language_modeling_metrics(request_state: RequestState, metric_service: MetricService) -> List[Stat]: """Compute the logprob and normalization factors for the first completion""" assert request_state.result is not None sequence = request_state.result.completions[0] diff --git a/src/helm/benchmark/metrics/cleva_harms_metrics.py b/src/helm/benchmark/metrics/cleva_harms_metrics.py index ab253edd71..757b0f0747 100644 --- a/src/helm/benchmark/metrics/cleva_harms_metrics.py +++ b/src/helm/benchmark/metrics/cleva_harms_metrics.py @@ -9,7 +9,6 @@ from helm.common.request import RequestResult from helm.common.hierarchical_logger import hlog from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer from helm.proxy.clients.perspective_api_client import PerspectiveAPIClientCredentialsError from helm.common.general import ensure_file_downloaded, ensure_directory_exists @@ -167,7 +166,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/code_metrics.py b/src/helm/benchmark/metrics/code_metrics.py index 01bcabcaa0..c6418d07f9 100644 --- a/src/helm/benchmark/metrics/code_metrics.py +++ b/src/helm/benchmark/metrics/code_metrics.py @@ -6,9 +6,7 @@ from helm.common.hierarchical_logger import hlog from helm.common.request import RequestResult -from helm.benchmark.adaptation.scenario_state import ScenarioState from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.scenarios.code_scenario import CodeReference from . import code_metrics_helper from .metric import Metric, MetricResult @@ -60,17 +58,16 @@ def __init__(self, names, timeout): # resource.setrlimit(resource.RLIMIT_AS, (MAXIMUM_MEMORY_BYTES, MAXIMUM_MEMORY_BYTES)) def evaluate( - self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int + self, request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, parallelism: int ) -> MetricResult: # Running with parallelism > 1 causes the run to get stuck. hlog( f"Setting parallelism from {parallelism} to 1, since evaluating code with parallelism > 1 isn't supported." 
) - return super().evaluate(scenario_state, metric_service, eval_cache_path, parallelism=1) + return super().evaluate(request_states, metric_service, eval_cache_path, parallelism=1) def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/copyright_metrics.py b/src/helm/benchmark/metrics/copyright_metrics.py index 8a47f194b5..065c2c58b4 100644 --- a/src/helm/benchmark/metrics/copyright_metrics.py +++ b/src/helm/benchmark/metrics/copyright_metrics.py @@ -5,7 +5,6 @@ from nltk.tokenize.treebank import TreebankWordTokenizer from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.scenarios.scenario import Reference from helm.common.optional_dependencies import handle_module_not_found_error from helm.common.request import RequestResult @@ -119,7 +118,6 @@ def __init__(self, name: str, normalize_by_prefix_length=False, normalize_newlin def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/disinformation_metrics.py b/src/helm/benchmark/metrics/disinformation_metrics.py index 5078b23ca3..dfed7dd092 100644 --- a/src/helm/benchmark/metrics/disinformation_metrics.py +++ b/src/helm/benchmark/metrics/disinformation_metrics.py @@ -10,7 +10,6 @@ from helm.common.optional_dependencies import handle_module_not_found_error from helm.common.request import RequestResult, Sequence from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from .metric import Metric from .metric_name import MetricName from .metric_service import MetricService @@ -76,9 +75,7 @@ def _fetch_human_evaluation_results(eval_cache_path: str, file_name: str) -> Dic return json.load(f) -def _compute_wedging_human_eval( - adapter_spec: AdapterSpec, request_state: RequestState, eval_cache_path: str -) -> List[Stat]: +def _compute_wedging_human_eval(request_state: RequestState, eval_cache_path: str) -> List[Stat]: """ Reads the file with the human evaluation results for the narrative wedging scenario, finds the annotations for the instance currently being evaluated, and outputs the human evaluation metrics for that instance. 
@@ -86,7 +83,7 @@ def _compute_wedging_human_eval( results: List[Stat] = [] instance_first_line = request_state.instance.input.text.splitlines()[0] human_evaluations = _fetch_human_evaluation_results(eval_cache_path, WEDGING_HUMAN_EVAL_FILE) - model_results = human_evaluations.get(adapter_spec.model_deployment) + model_results = human_evaluations.get(request_state.request.model_deployment) if not model_results: # Trying to evaluate a model we don't have annotations for @@ -115,7 +112,6 @@ def _compute_wedging_human_eval( def _compute_reiteration_human_eval( - adapter_spec: AdapterSpec, request_state: RequestState, eval_cache_path: str, ) -> List[Stat]: @@ -125,7 +121,7 @@ def _compute_reiteration_human_eval( """ results: List[Stat] = [] human_evaluations = _fetch_human_evaluation_results(eval_cache_path, REITERATION_HUMAN_EVAL_FILE) - model_results = human_evaluations.get(adapter_spec.model_deployment) + model_results = human_evaluations.get(request_state.request.model_deployment) if not model_results: # Trying to evaluate a model we don't have annotations for return results @@ -152,7 +148,7 @@ def _compute_reiteration_human_eval( "monte_carlo_entropy": _monte_carlo_entropy, } -human_metric_fns: Dict[str, Callable[[AdapterSpec, RequestState, str], List[Stat]]] = { +human_metric_fns: Dict[str, Callable[[RequestState, str], List[Stat]]] = { "wedging": _compute_wedging_human_eval, "reiteration": _compute_reiteration_human_eval, } @@ -167,7 +163,6 @@ def __init__(self, name): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, @@ -190,10 +185,9 @@ def __init__(self, name): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, ) -> List[Stat]: - metrics = self._metric_fn(adapter_spec, request_state, eval_cache_path) + metrics = self._metric_fn(request_state, eval_cache_path) return metrics diff --git a/src/helm/benchmark/metrics/dry_run_metrics.py b/src/helm/benchmark/metrics/dry_run_metrics.py index 7a1edfa523..ebd17bb005 100644 --- a/src/helm/benchmark/metrics/dry_run_metrics.py +++ b/src/helm/benchmark/metrics/dry_run_metrics.py @@ -3,7 +3,6 @@ from helm.common.general import parallel_map from helm.common.request import Request -from helm.benchmark.adaptation.scenario_state import ScenarioState from helm.benchmark.adaptation.request_state import RequestState from helm.benchmark.metrics.statistic import Stat, merge_stat from helm.benchmark.window_services.window_service import WindowService @@ -58,7 +57,7 @@ def __repr__(self): def evaluate( self, - scenario_state: ScenarioState, + request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, parallelism: int, @@ -69,7 +68,7 @@ def evaluate( processor = Processor(token_cost_estimator=self.token_cost_estimator, metric_service=metric_service) results: List[List[Stat]] = parallel_map( processor.process, - scenario_state.request_states, + request_states, parallelism=parallelism, ) @@ -81,7 +80,7 @@ def evaluate( request_state.train_trial_index, stats, ) - for request_state, stats in zip(scenario_state.request_states, results) + for request_state, stats in zip(request_states, results) ] # Aggregate @@ -90,6 +89,6 @@ def evaluate( for stat in instance_stats: merge_stat(stats, stat) - merge_stat(stats, Stat(MetricName("num_requests")).add(len(scenario_state.request_states))) + merge_stat(stats, 
Stat(MetricName("num_requests")).add(len(request_states))) return MetricResult(list(stats.values()), per_instance_stats) diff --git a/src/helm/benchmark/metrics/efficiency_metrics.py b/src/helm/benchmark/metrics/efficiency_metrics.py index 469942fa50..68cf308c8b 100644 --- a/src/helm/benchmark/metrics/efficiency_metrics.py +++ b/src/helm/benchmark/metrics/efficiency_metrics.py @@ -5,7 +5,6 @@ from helm.common.hierarchical_logger import hlog from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.window_services.window_service import WindowService from helm.benchmark.window_services.window_service_factory import WindowServiceFactory from helm.benchmark.window_services.tokenizer_service import TokenizerService @@ -59,9 +58,7 @@ def __init__(self): with data_package.joinpath(TRAINING_EFFICIENCY_JSON_FILENAME).open("r") as f: self.training_efficiency_dict = json.load(f) - def compute_efficiency_metrics( - self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService - ) -> List[Stat]: + def compute_efficiency_metrics(self, request_state: RequestState, metric_service: MetricService) -> List[Stat]: """Compute efficiency metrics for both inference and training. For inference, we record both the actual runtime and an estimated idealized runtime for the given request with an optimized software implementation run on A100 GPU(s), @@ -89,7 +86,7 @@ def compute_efficiency_metrics( # and calculate the number of tokens in the prompt. tokenizer_service: TokenizerService = metric_service window_service: WindowService = WindowServiceFactory.get_window_service( - adapter_spec.model_deployment, tokenizer_service + request_state.request.model_deployment, tokenizer_service ) prompt: str = request_state.request.prompt num_prompt_tokens: int = window_service.get_num_tokens(prompt) diff --git a/src/helm/benchmark/metrics/evaluate_instances_metric.py b/src/helm/benchmark/metrics/evaluate_instances_metric.py index 0918bd514e..30be118b05 100644 --- a/src/helm/benchmark/metrics/evaluate_instances_metric.py +++ b/src/helm/benchmark/metrics/evaluate_instances_metric.py @@ -4,8 +4,8 @@ from helm.benchmark.metrics.metric import MetricInterface, MetricResult, add_context -from helm.benchmark.adaptation.scenario_state import ScenarioState from helm.benchmark.adaptation.request_state import RequestState +from helm.benchmark.metrics.metric import group_request_states_by_train_trial from .metric_name import MetricName, MetricContext from .metric_service import MetricService from .statistic import Stat, merge_stat @@ -18,7 +18,7 @@ class EvaluateInstancesMetric(MetricInterface, ABC): """ def evaluate( - self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int + self, request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, parallelism: int ) -> MetricResult: """Aggregate over calls to evaluate_instances, which is defined by the subclass. @@ -26,10 +26,8 @@ def evaluate( 2. For each train trial, take the mean for each Stat. 3. Returns Stats built from those means (e.g. the mean in the result is the mean-of-means). 
""" - adapter_spec = scenario_state.adapter_spec global_stats: Dict[MetricName, Stat] = {} - - for train_trial_index in range(adapter_spec.num_train_trials): + for trial_request_states in group_request_states_by_train_trial(request_states): # Aggregate these stats trial_stats: Dict[MetricName, Stat] = {} # Statistics just for this trial @@ -37,13 +35,10 @@ def evaluate( # Compute statistics that depend on all the `RequestStates` (e.g., bias metrics). # Aggregate request states and call evaluate_instances in case the metric needs it. grouped_request_states: Dict[MetricContext, List[RequestState]] = defaultdict(list) - for instance in scenario_state.instances: - # TODO: do we need to support reference_index that is not None? - grouped_request_states[MetricContext.from_instance(instance)].extend( - scenario_state.get_request_states(train_trial_index, instance, None) - ) - for context, request_states in grouped_request_states.items(): - for stat in self.evaluate_instances(request_states): + for request_state in trial_request_states: + grouped_request_states[MetricContext.from_instance(request_state.instance)].append(request_state) + for context, request_states_for_context in grouped_request_states.items(): + for stat in self.evaluate_instances(request_states_for_context): merge_stat(trial_stats, add_context(stat, context)) # We take the mean value for each trial. diff --git a/src/helm/benchmark/metrics/evaluate_reference_metrics.py b/src/helm/benchmark/metrics/evaluate_reference_metrics.py index 5862a66f89..54fef57192 100644 --- a/src/helm/benchmark/metrics/evaluate_reference_metrics.py +++ b/src/helm/benchmark/metrics/evaluate_reference_metrics.py @@ -2,7 +2,6 @@ from typing import Callable, Dict, List, Optional, Set, Tuple, cast import numpy as np from functools import partial -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.adaptation.request_state import RequestState from helm.benchmark.metrics.cleva_metrics_helper import ChineseTokenizer from helm.benchmark.metrics.metric_name import MetricName @@ -265,7 +264,7 @@ def code_eval(gold: Tuple[str, Optional[Dict]], pred: str) -> float: # TODO This should probably be made into an implementation of MetricInterface. For now it lives here # just to separate it from basic_metrics.py. def compute_reference_metrics( - names: List[str], adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService + names: List[str], request_state: RequestState, metric_service: MetricService ) -> List[Stat]: """ Setup: @@ -281,6 +280,8 @@ def compute_reference_metrics( - ${score}@k: max_{i,j} score(Gi, Pj) """ + num_outputs = max(request_state.request.top_k_per_token, request_state.request.num_completions) + def compute_metrics_helper( name: MetricName, score_func: Callable, @@ -292,7 +293,7 @@ def compute_metrics_helper( results = [score_func((gold.output.text, gold.test_cases), pred) for gold in code_golds for pred in preds] _len, _sum = len(results), int(sum(results)) # Cast to int to make type match. score_1 = pass_at_k_estimator(_len, _sum, 1) - score_k = pass_at_k_estimator(_len, _sum, adapter_spec.num_outputs) + score_k = pass_at_k_estimator(_len, _sum, num_outputs) elif name.name == "code_eval_acc": score_func = cast(Callable[[Tuple[str, Optional[Dict]], str], float], score_func) # Make mypy happy. 
code_golds = cast(List[CodeReference], golds) @@ -306,8 +307,8 @@ def compute_metrics_helper( score_k = max(score_func(gold.output.text, pred) for gold in golds for pred in preds) metrics = [Stat(name).add(score_1)] # score_1 corresponds using one prediction - if adapter_spec.num_outputs != 1: - metrics.append(Stat(replace(name, name=f"{name.name}@{adapter_spec.num_outputs}")).add(score_k)) + if num_outputs != 1: + metrics.append(Stat(replace(name, name=f"{name.name}@{num_outputs}")).add(score_k)) return metrics # maps each string metric name to its associated function diff --git a/src/helm/benchmark/metrics/image_generation/aesthetics_metrics.py b/src/helm/benchmark/metrics/image_generation/aesthetics_metrics.py index d1f65a7707..5602104731 100644 --- a/src/helm/benchmark/metrics/image_generation/aesthetics_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/aesthetics_metrics.py @@ -4,7 +4,6 @@ from helm.common.images_utils import is_blacked_out_image from helm.common.request import RequestResult from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName @@ -27,7 +26,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/clip_score_metrics.py b/src/helm/benchmark/metrics/image_generation/clip_score_metrics.py index 8e0c57b4aa..ee042e82b7 100644 --- a/src/helm/benchmark/metrics/image_generation/clip_score_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/clip_score_metrics.py @@ -5,7 +5,6 @@ from helm.common.request import RequestResult from helm.common.clip_score_request import DEFAULT_CLIP_SCORE_MODEL, CLIPScoreResult, CLIPScoreRequest from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName @@ -30,7 +29,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/denoised_runtime_metric.py b/src/helm/benchmark/metrics/image_generation/denoised_runtime_metric.py index eb3bece3bc..ceb5f07c43 100644 --- a/src/helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +++ b/src/helm/benchmark/metrics/image_generation/denoised_runtime_metric.py @@ -1,12 +1,12 @@ from collections import defaultdict from tqdm import tqdm -from typing import Dict +from typing import Dict, List import math import numpy as np from helm.common.request import RequestResult +from helm.benchmark.adaptation.request_state import RequestState from helm.benchmark.scenarios.scenario import Instance -from helm.benchmark.adaptation.scenario_state import ScenarioState from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import MetricInterface, MetricResult from helm.benchmark.metrics.metric_name import MetricName @@ -19,14 +19,14 @@ def __repr__(self): def evaluate( self, - scenario_state: ScenarioState, + request_states: List[RequestState], metric_service: MetricService, eval_cache_path: 
str, parallelism: int, ) -> MetricResult: instance_to_min_request_times: Dict[Instance, float] = defaultdict(lambda: math.inf) - for request_state in tqdm(scenario_state.request_states): + for request_state in tqdm(request_states): assert request_state.result is not None request_result: RequestResult = request_state.result diff --git a/src/helm/benchmark/metrics/image_generation/detection_metrics.py b/src/helm/benchmark/metrics/image_generation/detection_metrics.py index 0b151431fe..347d75c8a4 100644 --- a/src/helm/benchmark/metrics/image_generation/detection_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/detection_metrics.py @@ -4,7 +4,6 @@ from helm.common.request import RequestResult from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName @@ -28,7 +27,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/efficiency_metrics.py b/src/helm/benchmark/metrics/image_generation/efficiency_metrics.py index 36d7345234..fe1dd54a5d 100644 --- a/src/helm/benchmark/metrics/image_generation/efficiency_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/efficiency_metrics.py @@ -2,7 +2,6 @@ from helm.common.request import RequestResult from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName @@ -20,7 +19,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/fidelity_metrics.py b/src/helm/benchmark/metrics/image_generation/fidelity_metrics.py index 3221264891..a3718bdedb 100644 --- a/src/helm/benchmark/metrics/image_generation/fidelity_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/fidelity_metrics.py @@ -3,13 +3,13 @@ import math import os import shutil +from helm.benchmark.adaptation.request_state import RequestState from helm.common.general import ensure_directory_exists, generate_unique_id, get_file_name, hlog from helm.common.gpu_utils import is_cuda_available, get_torch_device from helm.common.request import RequestResult from helm.benchmark.augmentations.perturbation_description import PerturbationDescription from helm.benchmark.scenarios.scenario import Instance -from helm.benchmark.adaptation.scenario_state import ScenarioState from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import MetricInterface, MetricResult from helm.benchmark.metrics.metric_name import MetricName @@ -54,7 +54,7 @@ def __repr__(self): def evaluate( self, - scenario_state: ScenarioState, + request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, parallelism: int, @@ -74,7 +74,7 @@ def evaluate( # The library requires the gold and generated images to be in two separate directories. 
# Gather the gold images and the unique perturbations num_gold_images: int = 0 - for request_state in tqdm(scenario_state.request_states): + for request_state in tqdm(request_states): instance: Instance = request_state.instance unique_perturbations.add(instance.perturbation) @@ -100,7 +100,7 @@ def evaluate( ensure_directory_exists(generated_images_path) num_generated_images: int = 0 - for request_state in tqdm(scenario_state.request_states): + for request_state in tqdm(request_states): if request_state.instance.perturbation != perturbation: continue diff --git a/src/helm/benchmark/metrics/image_generation/fractal_dimension_metric.py b/src/helm/benchmark/metrics/image_generation/fractal_dimension_metric.py index 18a9e630a3..faacea0f3e 100644 --- a/src/helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +++ b/src/helm/benchmark/metrics/image_generation/fractal_dimension_metric.py @@ -4,7 +4,6 @@ from helm.common.request import RequestResult from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName @@ -26,7 +25,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/gender_metrics.py b/src/helm/benchmark/metrics/image_generation/gender_metrics.py index a0b0ed801d..96b8a9c75f 100644 --- a/src/helm/benchmark/metrics/image_generation/gender_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/gender_metrics.py @@ -2,7 +2,6 @@ from typing import List from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName @@ -29,7 +28,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/image_critique_metrics.py b/src/helm/benchmark/metrics/image_generation/image_critique_metrics.py index e540df5205..03595b33ae 100644 --- a/src/helm/benchmark/metrics/image_generation/image_critique_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/image_critique_metrics.py @@ -3,8 +3,6 @@ import numpy as np from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.scenario_state import ScenarioState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, add_context from helm.benchmark.metrics.metric_name import MetricContext, MetricName from helm.benchmark.metrics.metric_service import MetricService @@ -92,18 +90,15 @@ def __repr__(self) -> str: def evaluate( self, - scenario_state: ScenarioState, + request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, parallelism: int, ) -> MetricResult: - request_states: List[RequestState] = [] if self._use_perturbed: - for request_state in scenario_state.request_states: + for request_state in request_states: if request_state.instance.perturbation is not None: request_states.append(request_state) - else: - 
request_states = scenario_state.request_states np.random.seed(0) if self._num_examples < len(request_states): @@ -120,7 +115,6 @@ def evaluate( for request_state in request_states: context = MetricContext.from_instance(request_state.instance) stats_without_context = self.evaluate_generation( - scenario_state.adapter_spec, request_state, metric_service, eval_cache_path, @@ -141,7 +135,6 @@ def evaluate( def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, @@ -169,7 +162,7 @@ def evaluate_generation( prompt = singleton(request_state.instance.contrast_inputs).text # Send the critique request - template: CritiqueTaskTemplate = self._get_critique_template(adapter_spec.model) + template: CritiqueTaskTemplate = self._get_critique_template(request_state.request.model) request = CritiqueRequest(template=template, fields={"prompt": prompt, "image": upload_result.url}) result = metric_service.make_critique_request(request) if not result or not result.responses: diff --git a/src/helm/benchmark/metrics/image_generation/lpips_metrics.py b/src/helm/benchmark/metrics/image_generation/lpips_metrics.py index 11e1b75b5a..8527e89321 100644 --- a/src/helm/benchmark/metrics/image_generation/lpips_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/lpips_metrics.py @@ -8,7 +8,6 @@ from helm.common.optional_dependencies import handle_module_not_found_error from helm.common.request import RequestResult from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName @@ -36,7 +35,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py b/src/helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py index 71451046c3..14a2875328 100644 --- a/src/helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py @@ -8,7 +8,6 @@ from helm.common.optional_dependencies import handle_module_not_found_error from helm.common.request import RequestResult from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName @@ -37,7 +36,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/nsfw_metrics.py b/src/helm/benchmark/metrics/image_generation/nsfw_metrics.py index 59857689ac..9bc07f9988 100644 --- a/src/helm/benchmark/metrics/image_generation/nsfw_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/nsfw_metrics.py @@ -2,7 +2,6 @@ from helm.common.request import RequestResult from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name 
import MetricName @@ -28,7 +27,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/nudity_metrics.py b/src/helm/benchmark/metrics/image_generation/nudity_metrics.py index 39e6af395f..20bd2f3421 100644 --- a/src/helm/benchmark/metrics/image_generation/nudity_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/nudity_metrics.py @@ -3,7 +3,6 @@ from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult from helm.common.request import RequestResult from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName @@ -21,7 +20,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py b/src/helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py index b8a9017d13..cce313a46a 100644 --- a/src/helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py @@ -3,8 +3,6 @@ import numpy as np from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.scenario_state import ScenarioState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, add_context from helm.benchmark.metrics.metric_name import MetricContext, MetricName from helm.benchmark.metrics.metric_service import MetricService @@ -42,18 +40,15 @@ def __repr__(self) -> str: def evaluate( self, - scenario_state: ScenarioState, + request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, parallelism: int, ) -> MetricResult: - request_states: List[RequestState] = [] if self._use_perturbed: - for request_state in scenario_state.request_states: + for request_state in request_states: if request_state.instance.perturbation is not None: request_states.append(request_state) - else: - request_states = scenario_state.request_states np.random.seed(0) if self._num_examples < len(request_states): @@ -70,7 +65,6 @@ def evaluate( for request_state in request_states: context = MetricContext.from_instance(request_state.instance) stats_without_context = self.evaluate_generation( - scenario_state.adapter_spec, request_state, metric_service, eval_cache_path, @@ -91,7 +85,6 @@ def evaluate( def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/psnr_metrics.py b/src/helm/benchmark/metrics/image_generation/psnr_metrics.py index 01c5862f7b..6e53fb1508 100644 --- a/src/helm/benchmark/metrics/image_generation/psnr_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/psnr_metrics.py @@ -8,7 +8,6 @@ from helm.common.request import RequestResult from helm.common.optional_dependencies import handle_module_not_found_error from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import 
AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName @@ -34,7 +33,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py b/src/helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py index a824354509..affa9e47e9 100644 --- a/src/helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py @@ -3,7 +3,6 @@ from helm.common.request import RequestResult from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName @@ -26,7 +25,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/skin_tone_metrics.py b/src/helm/benchmark/metrics/image_generation/skin_tone_metrics.py index 85ee280e0a..11e2f2e182 100644 --- a/src/helm/benchmark/metrics/image_generation/skin_tone_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/skin_tone_metrics.py @@ -3,7 +3,6 @@ import numpy as np from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName @@ -127,7 +126,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/uiqi_metrics.py b/src/helm/benchmark/metrics/image_generation/uiqi_metrics.py index 13480489c0..dd7fe6ce21 100644 --- a/src/helm/benchmark/metrics/image_generation/uiqi_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/uiqi_metrics.py @@ -10,8 +10,6 @@ from helm.common.optional_dependencies import handle_module_not_found_error from helm.common.request import RequestResult from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.scenario_state import ScenarioState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric import MetricResult @@ -39,14 +37,13 @@ def __repr__(self): return "UniversalImageQualityIndexMetric()" def evaluate( - self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int + self, request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, parallelism: int ) -> MetricResult: hlog(f"Setting parallelism from {parallelism} to 1, since computing UIQI with parallelism > 1 isn't supported.") - return super().evaluate(scenario_state, metric_service, eval_cache_path, parallelism=1) + return super().evaluate(request_states, metric_service, eval_cache_path, parallelism=1) def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, 
metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/image_generation/watermark_metrics.py b/src/helm/benchmark/metrics/image_generation/watermark_metrics.py index aa63c452b3..2749dce971 100644 --- a/src/helm/benchmark/metrics/image_generation/watermark_metrics.py +++ b/src/helm/benchmark/metrics/image_generation/watermark_metrics.py @@ -3,7 +3,6 @@ from helm.common.request import RequestResult from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.statistic import Stat from helm.benchmark.metrics.metric import Metric from helm.benchmark.metrics.metric_name import MetricName @@ -26,7 +25,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/instruction_following_critique_metrics.py b/src/helm/benchmark/metrics/instruction_following_critique_metrics.py index c3e506a072..08777284b4 100644 --- a/src/helm/benchmark/metrics/instruction_following_critique_metrics.py +++ b/src/helm/benchmark/metrics/instruction_following_critique_metrics.py @@ -1,7 +1,6 @@ from typing import Dict, List from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from .metric import Metric from .metric_name import MetricName from .metric_service import MetricService @@ -143,7 +142,6 @@ def __repr__(self) -> str: def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/language_modeling_metrics.py b/src/helm/benchmark/metrics/language_modeling_metrics.py index 84ea133ecb..77a1c9f61d 100644 --- a/src/helm/benchmark/metrics/language_modeling_metrics.py +++ b/src/helm/benchmark/metrics/language_modeling_metrics.py @@ -1,7 +1,6 @@ from collections import defaultdict from typing import List, Dict, Set -from helm.benchmark.adaptation.scenario_state import ScenarioState from helm.benchmark.metrics.basic_metrics import ( compute_language_modeling_metrics, compute_perplexity_metrics, @@ -10,7 +9,6 @@ from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from .metric import MetricInterface, MetricResult, PerInstanceStats, add_context from .metric_name import MetricContext, MetricName from .metric_service import MetricService @@ -31,7 +29,7 @@ def __repr__(self): return "LanguageModelingMetric" def evaluate( - self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int + self, request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, parallelism: int ) -> MetricResult: global_stats: Dict[MetricName, Stat] = {} # The first and only trial @@ -40,11 +38,9 @@ def evaluate( all_per_instance_stats: List[PerInstanceStats] = [] instance_ids_per_context: Dict[MetricContext, Set[str]] = defaultdict(set) - for request_state in scenario_state.request_states: + for request_state in request_states: # Evaluate request_state - request_stats = self.evaluate_generation( - scenario_state.adapter_spec, request_state, metric_service, eval_cache_path - ) + request_stats = self.evaluate_generation(request_state, metric_service, eval_cache_path) # Add 
instance-related context (e.g., split, perturbation) to the metrics for i, stat in enumerate(request_stats): @@ -80,15 +76,14 @@ def evaluate( def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, ) -> List[Stat]: """Compute all metrics.""" stats: List[Stat] = [] - stats.extend(compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service)) - stats.extend(compute_language_modeling_metrics(adapter_spec, request_state, metric_service)) + stats.extend(compute_request_state_metrics(self.efficiency_metric, request_state, metric_service)) + stats.extend(compute_language_modeling_metrics(request_state, metric_service)) return stats diff --git a/src/helm/benchmark/metrics/metric.py b/src/helm/benchmark/metrics/metric.py index 7d41eab707..2587c20480 100644 --- a/src/helm/benchmark/metrics/metric.py +++ b/src/helm/benchmark/metrics/metric.py @@ -10,10 +10,7 @@ PERTURBATION_ORIGINAL, PERTURBATION_WORST, ) -from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_LANGUAGE_MODELING -from helm.benchmark.adaptation.scenario_state import ScenarioState from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.scenarios.scenario import Instance from .metric_name import MetricName, MetricContext from .metric_service import MetricService @@ -65,7 +62,6 @@ class Processor: metric: "Metric" metric_service: MetricService eval_cache_path: str - adapter_spec: AdapterSpec def process(self, request_state_set: RequestStateSet) -> List[Stat]: instance_stats: List[Stat] = [] @@ -74,18 +70,14 @@ def process(self, request_state_set: RequestStateSet) -> List[Stat]: generation_states = request_state_set.generation_states if len(generation_states) != 0: instance_stats.extend( - self.metric.evaluate_generation( - self.adapter_spec, singleton(generation_states), self.metric_service, self.eval_cache_path - ) + self.metric.evaluate_generation(singleton(generation_states), self.metric_service, self.eval_cache_path) ) # Evaluate the references references_states = request_state_set.references_states if len(references_states) != 0: instance_stats.extend( - self.metric.evaluate_references( - self.adapter_spec, references_states, self.metric_service, self.eval_cache_path - ) + self.metric.evaluate_references(references_states, self.metric_service, self.eval_cache_path) ) # Add instance-related context (e.g., split, perturbation) to the metrics @@ -100,7 +92,7 @@ class MetricInterface(ABC): @abstractmethod def evaluate( - self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int + self, request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, parallelism: int ) -> MetricResult: pass @@ -116,7 +108,7 @@ class Metric(MetricInterface, ABC): """ def evaluate( - self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int + self, request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, parallelism: int ) -> MetricResult: """ Main entry point for a `Metric`. This function groups the single @@ -126,37 +118,28 @@ def evaluate( Any logic that doesn't decompose along instances should go here, such as robustness. """ - assert scenario_state.adapter_spec.method != ADAPT_LANGUAGE_MODELING, ( - "Metric no longer knows how to handle the language modeling adapter. 
" - + "All run_specs with that adapter should use LanguageModelingMetric. " - + "If you are seeing this issue, please file a Github issue." - ) - - adapter_spec = scenario_state.adapter_spec global_stats: Dict[MetricName, Stat] = {} all_per_instance_stats: List[PerInstanceStats] = [] - for train_trial_index in range(adapter_spec.num_train_trials): + for train_trial_index, trial_request_states in enumerate(group_request_states_by_train_trial(request_states)): # Construct inputs - request_state_sets: List[RequestStateSet] = [] - for instance in scenario_state.instances: - generation_states = scenario_state.get_request_states(train_trial_index, instance, None) - references_states = [] - for reference_index in range(len(instance.references)): - references_states.extend( - scenario_state.get_request_states(train_trial_index, instance, reference_index) - ) - request_state_set = RequestStateSet( - instance=instance, generation_states=generation_states, references_states=references_states - ) - request_state_sets.append(request_state_set) + instance_to_request_state_set: Dict[Instance, RequestStateSet] = {} + for request_state in trial_request_states: + instance: Instance = request_state.instance + if instance not in instance_to_request_state_set: + instance_to_request_state_set[instance] = RequestStateSet(instance, [], []) + if request_state.reference_index is None: + instance_to_request_state_set[instance].generation_states.append(request_state) + else: + instance_to_request_state_set[instance].references_states.append(request_state) + request_state_sets: List[RequestStateSet] = list(instance_to_request_state_set.values()) + instances: List[Instance] = list(instance_to_request_state_set.keys()) # Do it! processor = Processor( metric=self, metric_service=metric_service, eval_cache_path=eval_cache_path, - adapter_spec=scenario_state.adapter_spec, ) results: List[List[Stat]] = parallel_map( processor.process, @@ -166,7 +149,7 @@ def evaluate( # Compute per-instance stats per_instance_stats: List[PerInstanceStats] = [] - for instance, stats in zip(scenario_state.instances, results): + for instance, stats in zip(instances, results): assert instance.id is not None, f"id was none for instance: {instance}" # Sometimes a metric (e.g., BiasMetric) doesn't produce any statistics if len(stats) > 0: @@ -198,7 +181,7 @@ def evaluate( grouped_per_instance_stats: Dict[MetricContext, Dict[Instance, List[Stat]]] = defaultdict( lambda: defaultdict(list) ) - for instance, stats in zip(scenario_state.instances, results): + for instance, stats in zip(instances, results): for stat in stats: grouped_per_instance_stats[MetricContext.from_instance(instance)][instance].append(stat) for context, instance_dict in grouped_per_instance_stats.items(): @@ -210,7 +193,7 @@ def evaluate( # Compute worst-case metrics. # This is here since we want these stats for all metrics and they # aggregate across contexts (perturbations). 
- worst_case_stats = self.compute_worst_case_metrics(dict(zip(scenario_state.instances, results))) + worst_case_stats = self.compute_worst_case_metrics(dict(zip(instances, results))) for stat in worst_case_stats: merge_stat(trial_stats, stat) @@ -225,7 +208,6 @@ def evaluate( def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, @@ -235,7 +217,6 @@ def evaluate_generation( def evaluate_references( self, - adapter_spec: AdapterSpec, reference_request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, @@ -352,3 +333,19 @@ def add_context(stat: Stat, context: MetricContext) -> Stat: return Stat( replace(stat.name, split=context.split, sub_split=context.sub_split, perturbation=context.perturbation) ).merge(stat) + + +def get_num_train_trials(request_states: List[RequestState]) -> int: + """Return the number of train trials.""" + return max([request_state.train_trial_index for request_state in request_states]) + 1 + + +def group_request_states_by_train_trial(request_states: List[RequestState]) -> List[List[RequestState]]: + """Groups RequestStates by train trial index.""" + grouped_request_states: List[List[RequestState]] = [] + for request_state in request_states: + train_trial_index = request_state.train_trial_index + while len(grouped_request_states) < train_trial_index + 1: + grouped_request_states.append([]) + grouped_request_states[train_trial_index].append(request_state) + return grouped_request_states diff --git a/src/helm/benchmark/metrics/numeracy_metrics.py b/src/helm/benchmark/metrics/numeracy_metrics.py index 5f87764f65..e7bea85ce1 100644 --- a/src/helm/benchmark/metrics/numeracy_metrics.py +++ b/src/helm/benchmark/metrics/numeracy_metrics.py @@ -2,7 +2,6 @@ from helm.common.request import RequestResult from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.scenarios.numeracy_scenario import ( # noqa NumeracyScenario, Polynomial, @@ -35,7 +34,6 @@ class DistanceMetric(Metric): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/ranking_metrics.py b/src/helm/benchmark/metrics/ranking_metrics.py index b7c823eca5..2da686c6d1 100644 --- a/src/helm/benchmark/metrics/ranking_metrics.py +++ b/src/helm/benchmark/metrics/ranking_metrics.py @@ -3,7 +3,6 @@ from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_RANKING_BINARY from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.common.optional_dependencies import handle_module_not_found_error from helm.benchmark.scenarios.scenario import unpack_tag, CORRECT_TAG, Reference from helm.common.request import RequestResult @@ -339,7 +338,6 @@ def compute_measures( def evaluate_references( self, - adapter_spec: AdapterSpec, reference_request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/summarization_critique_metrics.py b/src/helm/benchmark/metrics/summarization_critique_metrics.py index ad7a2a95d4..e8c56cf9e1 100644 --- a/src/helm/benchmark/metrics/summarization_critique_metrics.py +++ b/src/helm/benchmark/metrics/summarization_critique_metrics.py @@ -1,7 +1,6 @@ from typing import Dict, List from helm.benchmark.adaptation.request_state 
import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from .metric import Metric from .metric_name import MetricName from .metric_service import MetricService @@ -63,7 +62,6 @@ def __repr__(self) -> str: def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/metrics/summarization_metrics.py b/src/helm/benchmark/metrics/summarization_metrics.py index 45c0c9f346..92649b5489 100644 --- a/src/helm/benchmark/metrics/summarization_metrics.py +++ b/src/helm/benchmark/metrics/summarization_metrics.py @@ -9,9 +9,7 @@ from typing import List, Dict, Optional from collections import defaultdict -from helm.benchmark.adaptation.scenario_state import ScenarioState from helm.benchmark.adaptation.request_state import RequestState -from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function from helm.common.hierarchical_logger import hlog from helm.common.general import ensure_file_downloaded @@ -118,7 +116,7 @@ def _load_humaneval(self, eval_cache_path: str) -> Dict: return all_humaneval_scores def evaluate( - self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int + self, request_states: List[RequestState], metric_service: MetricService, eval_cache_path: str, parallelism: int ) -> MetricResult: if self.compute_faithfulness: # When running with a GPU and parallelism > 1, errors with "...in layer_norm @@ -130,7 +128,7 @@ def evaluate( ) parallelism = 1 - return super().evaluate(scenario_state, metric_service, eval_cache_path, parallelism=parallelism) + return super().evaluate(request_states, metric_service, eval_cache_path, parallelism=parallelism) def _compute_rouge(self, refs: List[str], pred: str) -> Dict[str, float]: metrics: Dict[str, float] = {} @@ -164,7 +162,6 @@ def _remove_braces(self, text: str) -> str: def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, @@ -182,7 +179,7 @@ def evaluate_generation( self.humaneval = self._load_humaneval(eval_cache_path) # get human evaluation scores if they exist - deployment = adapter_spec.model_deployment.replace("/", "_") + deployment = request_state.request.model_deployment.replace("/", "_") for metric_name in ["faithfulness", "relevance", "coherence"]: val = self.humaneval[(metric_name, deployment, request_state.instance.id, pred)] result.append(Stat(MetricName(f"HumanEval-{metric_name}")).add(float(val))) @@ -196,7 +193,7 @@ def evaluate_generation( if self.qa_fact_eval is None: self._load_qafacteval(eval_cache_path) assert self.qa_fact_eval is not None - deployment = adapter_spec.model_deployment.replace("/", "_") + deployment = request_state.request.model_deployment.replace("/", "_") val = self.qa_fact_eval[deployment][(request_state.instance.id, pred)] result.append(Stat(MetricName("QAFactEval")).add(float(val))) except KeyError: diff --git a/src/helm/benchmark/metrics/toxicity_metrics.py b/src/helm/benchmark/metrics/toxicity_metrics.py index 6808875da9..993d3a99d1 100644 --- a/src/helm/benchmark/metrics/toxicity_metrics.py +++ b/src/helm/benchmark/metrics/toxicity_metrics.py @@ -4,7 +4,6 @@ from helm.common.request import RequestResult from helm.common.hierarchical_logger import hlog from helm.benchmark.adaptation.request_state import RequestState -from 
helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.proxy.clients.perspective_api_client import PerspectiveAPIClientCredentialsError from .metric import Metric from .metric_name import MetricName @@ -28,7 +27,6 @@ def __repr__(self): def evaluate_generation( self, - adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService, eval_cache_path: str, diff --git a/src/helm/benchmark/runner.py b/src/helm/benchmark/runner.py index 6408a8fe34..28d569b507 100644 --- a/src/helm/benchmark/runner.py +++ b/src/helm/benchmark/runner.py @@ -319,7 +319,7 @@ def run_one(self, run_spec: RunSpec): for metric in metrics: with htrack_block(metric): metric_result: MetricResult = metric.evaluate( - scenario_state, + scenario_state.request_states, self.metric_service, self.eval_cache_path, self.executor.execution_spec.parallelism,
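
Usage note (not part of the patch): after this change, metrics no longer read adapter_spec.num_train_trials; they recover the trials from the request states themselves via the group_request_states_by_train_trial helper added to metric.py above. Below is a minimal, self-contained sketch of that grouping behavior, assuming a hypothetical SimpleRequestState stand-in that keeps only the single field the helper reads; it is an illustration, not HELM's RequestState.

    from dataclasses import dataclass
    from typing import List


    @dataclass(frozen=True)
    class SimpleRequestState:
        """Hypothetical stand-in for RequestState, reduced to the field the helper uses."""

        train_trial_index: int


    def group_request_states_by_train_trial(
        request_states: List[SimpleRequestState],
    ) -> List[List[SimpleRequestState]]:
        """Same grouping logic as the helper added to metric.py in this patch."""
        grouped: List[List[SimpleRequestState]] = []
        for request_state in request_states:
            # Grow the outer list until the trial index fits, then bucket the state.
            while len(grouped) < request_state.train_trial_index + 1:
                grouped.append([])
            grouped[request_state.train_trial_index].append(request_state)
        return grouped


    if __name__ == "__main__":
        states = [SimpleRequestState(0), SimpleRequestState(1), SimpleRequestState(0)]
        print([len(trial) for trial in group_request_states_by_train_trial(states)])  # prints [2, 1]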