diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index ba171a239a..f6dc31b7f3 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -976,22 +976,20 @@ def _adapter_spec_sort_key(spec): if strategy == AggregationStrategy.WIN_RATE: WIN_RATE_AGGREGATION = "mean" win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION) - description = "How many models this model outperforms on average (over columns)." aggregate_header_cells.append( HeaderCell( f"{WIN_RATE_AGGREGATION.capitalize()} win rate", - description=description, + description="How many models this model outperforms on average (over columns).", lower_is_better=False, ) ) aggregate_row_values.append(win_rates) elif strategy == AggregationStrategy.MEAN: means = compute_aggregate_row_means(table) - description = "An average over columns representing the mean performance." aggregate_header_cells.append( HeaderCell( - "Mean performance", - description=description, + "Mean score", + description="The mean of the scores from all columns.", lower_is_better=table.header[0].lower_is_better, ) ) diff --git a/src/helm/benchmark/static/schema_safety.yaml b/src/helm/benchmark/static/schema_safety.yaml index c35c0b5b53..dd6cf0eec3 100644 --- a/src/helm/benchmark/static/schema_safety.yaml +++ b/src/helm/benchmark/static/schema_safety.yaml @@ -114,7 +114,6 @@ metric_groups: - name: accuracy display_name: Accuracy aggregation_strategies: - - win_rate - mean metrics: - name: ${main_name}