Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New safety scenario: HarmBench GCG-T #3035

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions src/helm/benchmark/run_specs/safety_run_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,43 @@ def get_harm_bench_spec() -> RunSpec:
)


@run_spec_function("harm_bench_gcg_transfer")
def get_harm_bench_gcg_transfer_spec() -> RunSpec:
    """Build the RunSpec for the HarmBench GCG-T safety scenario.

    The adapter is a bare zero-shot generation setup: every prefix/suffix and
    the instructions are empty strings so the adversarial prompt reaches the
    model verbatim, with deterministic decoding (temperature 0) and a single
    512-token completion per instance.
    """
    # Scenario that loads the GCG-T (transfer-attack) behavior prompts.
    scenario = ScenarioSpec(
        class_name="helm.benchmark.scenarios.harm_bench_gcg_transfer_scenario.HarmBenchGCGTransferScenario"
    )

    # Model-based safety judge reused from the base HarmBench scenario.
    annotators = [AnnotatorSpec(class_name="helm.benchmark.annotation.harm_bench_annotator.HarmBenchAnnotator")]

    # Safety score plus generic generation/count metrics.
    metrics = [
        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyScoreMetric"),
        MetricSpec(class_name="helm.benchmark.metrics.safety_metrics.SafetyBasicGenerationMetric"),
        MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric"),
    ]

    # Pass prompts through untouched: no instructions, no affixes, no in-context examples.
    adapter = AdapterSpec(
        method=ADAPT_GENERATION,
        global_prefix="",
        global_suffix="",
        instructions="",
        input_prefix="",
        input_suffix="",
        output_prefix="",
        output_suffix="",
        instance_prefix="",
        max_train_instances=0,
        num_outputs=1,
        max_tokens=512,
        temperature=0.0,
        stop_sequences=[],
    )

    return RunSpec(
        name="harm_bench_gcg_transfer",
        scenario_spec=scenario,
        adapter_spec=adapter,
        metric_specs=metrics,
        annotators=annotators,
        groups=["harm_bench_gcg_transfer"],
    )


@run_spec_function("simple_safety_tests")
def get_simple_safety_tests_spec() -> RunSpec:
adapter_spec = AdapterSpec(
Expand Down
50 changes: 50 additions & 0 deletions src/helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os
import pandas as pd
from typing import List

from helm.common.general import ensure_file_downloaded

from .scenario import Scenario, Instance, Input, TEST_SPLIT, Reference, Output


class HarmBenchGCGTransferScenario(Scenario):
    """
    HarmBench GCG-T is a standardized evaluation framework for automated red teaming.
    HarmBench identifies key considerations previously unaccounted for in red teaming
    evaluations and provides systematically designed prompts that meet these criteria.

    https://arxiv.org/abs/2402.04249
    """

    name = "harm_bench_gcg_transfer"
    description = "HarmBench GCG-T dataset - standard and contextual attacks"
    tags = ["safety"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the GCG-T prompt CSV and convert each row to a test Instance.

        Args:
            output_path: Directory under which the dataset file is cached.

        Returns:
            One TEST_SPLIT Instance per CSV row; the input is the row's
            `behavior_prompt` column and the instance id is `behavior_name`.
        """
        data_path = os.path.join(output_path, "data")
        os.makedirs(data_path, exist_ok=True)
        url = "https://raw.githubusercontent.com/farzaank/harmbench-gcg-ensembled/refs/heads/main/output.csv"  # noqa: E501
        outf_path = os.path.join(data_path, "harmbench_gcg.csv")
        ensure_file_downloaded(
            source_url=url,
            target_path=outf_path,
            unpack=False,
        )

        df = pd.read_csv(outf_path)

        # Read all the instances
        instances: List[Instance] = []
        # we take the direct_request template from HarmBench for simplicity
        # see the example here
        # https://github.com/centerforaisafety/HarmBench/blob/main/baselines/direct_request/direct_request.py
        #
        # in the future we can use other types of templates including various red team attacks
        for _, row in df.iterrows():  # row index is unused
            # Placeholder reference: the safety annotator judges the generation,
            # so no gold output exists for these adversarial prompts.
            references = [Reference(output=Output(text="unsafe behavior"), tags=[])]
            # Avoid shadowing the builtins `input` and `id` used by the original draft.
            instance = Instance(
                input=Input(text=row["behavior_prompt"]),
                split=TEST_SPLIT,
                references=references,
                id=row["behavior_name"],
            )
            instances.append(instance)
        return instances
19 changes: 18 additions & 1 deletion src/helm/benchmark/static/schema_safety.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ metric_groups:
display_name: Accuracy
aggregation_strategies:
- win_rate
- mean
metrics:
- name: ${main_name}
split: ${main_split}
Expand Down Expand Up @@ -172,6 +171,7 @@ run_groups:
description: Scenarios for the model safety
category: All scenarios
subgroups:
- harm_bench_gcg_transfer
- harm_bench
- simple_safety_tests
- bbq
Expand All @@ -195,6 +195,23 @@ run_groups:
when: n/a
language: English

- name: harm_bench_gcg_transfer
display_name: HarmBenchGCGTransfer
description: HarmBenchGCGTransfer
metric_groups:
- accuracy
- general_information
- annotation_metrics
environment:
main_name: safety_score
main_split: test
taxonomy:
task: question answering
what: n/a
who: n/a
when: n/a
language: English

- name: simple_safety_tests
display_name: SimpleSafetyTests
description: SimpleSafetyTests
Expand Down