diff --git a/MANIFEST.in b/MANIFEST.in
index be7eb2985a5..57c013ae678 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,3 +4,4 @@ recursive-include src/helm/benchmark/ *.json
recursive-include src/helm/benchmark/static/ *.css *.html *.js *.png *.yaml
recursive-include src/helm/benchmark/static_build/ *.css *.html *.js *.png *.yaml
recursive-include src/helm/config/ *.yaml
+recursive-include src/helm/benchmark/annotation/omni_math/ *.txt
diff --git a/setup.cfg b/setup.cfg
index e39e424220d..6d11cbbafcb 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -81,6 +81,7 @@ metrics =
sacrebleu~=2.2.1 # For disinformation_metrics, machine_translation_metrics
langdetect~=1.0.9 # For ifeval_metrics
immutabledict~=4.2.0 # For ifeval_metrics
+ gradio_client~=1.3 # For bigcodebench_metrics
summarization =
summ-eval~=0.892 # For summarization_metrics
diff --git a/src/helm/benchmark/annotation/annotator.py b/src/helm/benchmark/annotation/annotator.py
index 4dea4b1cc29..fe835fb3eeb 100644
--- a/src/helm/benchmark/annotation/annotator.py
+++ b/src/helm/benchmark/annotation/annotator.py
@@ -20,6 +20,11 @@ def annotate(self, request_state: RequestState) -> Any:
that are implementation specific."""
pass
+    def annotate_all(self, request_states: List[RequestState]) -> List[Dict[str, Any]]:
+        """Fills the annotations field of all request states with additional information
+        that is implementation specific. The default implementation annotates each request state
+        independently; annotators that must score all requests together should override this."""
+ return [self.annotate(request_state) for request_state in request_states]
+
@dataclass(frozen=True)
class AnnotatorSpec(ObjectSpec):
diff --git a/src/helm/benchmark/annotation/bigcodebench_annotator.py b/src/helm/benchmark/annotation/bigcodebench_annotator.py
new file mode 100644
index 00000000000..f8ed7b76f84
--- /dev/null
+++ b/src/helm/benchmark/annotation/bigcodebench_annotator.py
@@ -0,0 +1,109 @@
+import ast
+import json
+import os
+import traceback
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.common.hierarchical_logger import hlog
+
+from typing import Any, List, Dict
+from gradio_client import Client, handle_file
+from tempfile import TemporaryDirectory
+from retrying import retry
+
+
+OUTPUT_FILENAME = "tmp_result.jsonl"
+
+
+def syntax_check(code: str, verbose: bool = False) -> bool:
+ try:
+ ast.parse(code)
+ return True
+ except (SyntaxError, MemoryError):
+ if verbose:
+ traceback.print_exc()
+ return False
+
+
+def code_extract(text: str) -> str:
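+    # Return the longest contiguous block of lines that parses as valid Python,
+    # where "longest" is measured by the number of non-empty lines in the block.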
+ lines = text.split("\n")
+ longest_line_pair = (0, 0)
+ longest_so_far = 0
+
+ for i in range(len(lines)):
+ for j in range(i + 1, len(lines)):
+ current_lines = "\n".join(lines[i : j + 1])
+ if syntax_check(current_lines):
+ current_length = sum(1 for line in lines[i : j + 1] if line.strip())
+ if current_length > longest_so_far:
+ longest_so_far = current_length
+ longest_line_pair = (i, j)
+
+ return "\n".join(lines[longest_line_pair[0] : longest_line_pair[1] + 1])
+
+
+class BigCodeBenchAnnotator(Annotator):
+ """The BigCodeBench autograder."""
+
+ name = "bigcodebench"
+
+ def __init__(self):
+ self.remote_execute_api = "https://bigcode-bigcodebench-evaluator.hf.space/"
+ self.split = "instruct"
+ self.subset = "full"
+ self.pass_k = "1" # Original: "1,5,10"
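+        # Tells the annotation executor to call annotate_all() on the full batch instead of annotate() per request.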
+ self.use_global_metric = True
+        self.num_instances = 1140  # Size of the "instruct" split, "full" subset of the dataset
+
+ def annotate(self, request_state: RequestState) -> Any:
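+        # Not used: this annotator sets use_global_metric and evaluates all requests at once via annotate_all().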
+ pass
+
+ @retry(stop_max_attempt_number=3, wait_fixed=4000)
+ def predict_with_retry(self, filename: str):
+ client = Client(self.remote_execute_api)
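+        # handle_file wraps the local path so gradio_client uploads the JSONL submission to the evaluator space.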
+ results, evals = client.predict(
+ split=self.split,
+ subset=self.subset,
+ samples=handle_file(filename),
+ pass_k=self.pass_k,
+ api_name="/predict",
+ )
+ pass_at_one = evals["pass@1"]
+ return results, pass_at_one
+
+ def annotate_all(self, request_states: List[RequestState]) -> List[Dict[str, Any]]:
+        assert all(
+            request_state.result is not None and len(request_state.result.completions) == 1
+            for request_state in request_states
+        )
+ assert all(request_state.instance.extra_data for request_state in request_states)
+
+        with TemporaryDirectory() as tmpdir:
+            hlog(f"Temp Dir: {tmpdir}")
+            # Write the submission file inside the temporary directory so it is cleaned up afterwards.
+            output_filename = os.path.join(tmpdir, OUTPUT_FILENAME)
+            with open(output_filename, "w") as file:
+ res = []
+ for i in range(self.num_instances):
+ init_line = f'{{"task_id": "BigCodeBench/{i}", "solution": ""}}\n'
+ res.append(init_line)
+ for request_state in request_states:
+ assert request_state.result is not None
+ model_output_text = request_state.result.completions[0].text
+ solution = code_extract(model_output_text)
+ assert request_state.instance.id is not None
+ idx = int(request_state.instance.id.split("/")[-1])
+ res[idx] = json.dumps({"task_id": request_state.instance.id, "solution": solution}) + "\n"
+ for line in res:
+ file.write(line)
+
+            try:
+                results, _ = self.predict_with_retry(output_filename)
+                ret = [
+                    {"bigcodebench": {"pass_at_one": results["eval"][state.instance.id][0]["status"] == "pass"}}
+                    for state in request_states
+                ]
+                return ret
+            except Exception as e:
+                hlog(f"Failed to complete the operation after 3 attempts. Exception: {e}")
+                raise
diff --git a/src/helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt b/src/helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt
new file mode 100644
index 00000000000..0f8ee944751
--- /dev/null
+++ b/src/helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt
@@ -0,0 +1,152 @@
+# CONTEXT #
+I am a teacher, and I have some high-level math problems. I am tasked with evaluating the correctness of a student's answer.
+Below, I am provided with a problem and a reference answer. Additionally, a student's answer is provided. My job is to assess whether the student's answer captures the same meaning as the reference answer, even when expressed with different wording or format.
+
+# OBJECTIVE #
+I need you to judge whether the student's answer is correct given the ground truth answer.
+
+Your tasks include:
+A. Identify Mathematical or Notational Equivalence: Pay special attention to any LaTeX expressions in both answers. Confirm that the mathematical relationships, variables, and operations conveyed are equivalent.
+B. Provide a Justification: Conclude with a brief explanation as to why you believe the student's output is correct or incorrect, highlighting any key differences in meaning or content.
+
+# STYLE #
+Teaching report.
+
+# TONE #
+Professional, scientific.
+
+# AUDIENCE #
+Students. Enable them to better understand whether the answer they produce is correct.
+
+# RESPONSE: MARKDOWN REPORT #
+## Student Final Answer
+[Extract the student's final answer, which is enclosed in "\\boxed{}".]
+## Equivalence Judgement
+[Whether the student's answer shares the same meaning as the reference answer. (TRUE or FALSE)]
+## Justification
+[Conclude with a brief explanation as to why you believe the student's answer is correct or incorrect.]
+
+
+# ATTENTION #
+ - The reference answer is ALWAYS correct. You should carefully judge whether the student gives the same answer as the reference answer.
+ - The Equivalence Judgement is only TRUE or FALSE. The judgement is FALSE even if the student's final answer is almost correct but contains a minor mistake.
+ - The answer is contained within the "boxed" section, so you can focus solely on comparing the content in the student's answer box with the reference answer, without needing to consider the intermediate steps.
+ - Add "=== report over ===" at the end of the report.
+
+
+**Question**:
+The numbers $1447$ , $1005$ and $1231$ have something in common: each is a $4$ -digit number beginning with $1$ that has exactly two identical digits. How many such numbers are there?
+
+**Reference Answer**
+432
+
+**Student Solution**:
+To determine how many 4-digit numbers beginning with 1 have exactly two identical digits, we need to consider the structure of these numbers. A number of this form can be represented as \(1abc\), where \(a\), \(b\), and \(c\) are digits from 0 to 9, and exactly one of the digits \(a\), \(b\), or \(c\) is 1, while the other two are different from each other and different from 1.
+
+We can break down the problem into the following steps:
+
+1. **Choose the digit that is 1**: There are 3 positions (a, b, c) where the digit 1 can be placed. So, there are 3 choices for the position of the digit 1.
+
+2. **Choose the other two digits**: The remaining two digits must be different from each other and different from 1. Therefore, we have 9 choices for the first of these two digits (0-9 except 1) and 8 choices for the second digit (0-9 except 1 and the first digit). This gives us \(9 \times 8 = 72\) choices for the other two digits.
+
+Combining these choices, the total number of 4-digit numbers beginning with 1 that have exactly two identical digits is:
+\[3 \times 72 = 216.\]
+
+Thus, the number of such numbers is \(\boxed{216}\).
+
+
+
+## Student Final Answer
+216
+
+## Equivalence Judgement
+FALSE
+
+## Justification
+The student's answer of 216 is incorrect in the context of the problem, which asks for the total count of 4-digit numbers beginning with 1 that have exactly two identical digits. The reference answer is 432.
+In the student's solution, they consider only cases where the digit '1' is one of the identical digits. However, the problem also includes the scenario where the identical digits could be different from '1'. Thus, the student's calculation does not account for all valid configurations. The discrepancy in figures indicates that the student's answer does not share the same meaning as the reference answer.
+
+=== report over ===
+
+
+
+**Question**:
+Susana and Brenda play a game writing polynomials on the board. Susana starts and they play taking turns.
+1) On the preparatory turn (turn 0), Susana choose a positive integer $n_0$ and writes the polynomial $P_0(x)=n_0$.
+2) On turn 1, Brenda choose a positive integer $n_1$, different from $n_0$, and either writes the polynomial
+
+$$P_1(x)=n_1x+P_0(x) \textup{ or } P_1(x)=n_1x-P_0(x)$$
+
+3) In general, on turn $k$, the respective player chooses an integer $n_k$, different from $n_0, n_1, \ldots, n_{k-1}$, and either writes the polynomial
+$$P_k(x)=n_kx^k+P_{k-1}(x) \textup{ or } P_k(x)=n_kx^k-P_{k-1}(x)$$
+The first player to write a polynomial with at least one whole number root wins. Find and describe a winning strategy.
+
+**Reference Answer**
+\\text{No winning strategy; the game can last indefinitely.}
+
+**Student Solution**:
+To determine a winning strategy for Susana, we need to analyze the game and the conditions under which a player can win. The goal is to write a polynomial that has at least one integer root.
+
+Let's start by examining the polynomials that are written on the board after each turn.
+
+1. On turn 0, Susana writes the polynomial \( P_0(x) = n_0 \), where \( n_0 \) is a positive integer. This polynomial is a constant polynomial with no integer roots unless \( n_0 = 0 \), which is not allowed since \( n_0 \) is a positive integer.
+
+2. On turn 1, Brenda chooses a positive integer \( n_1 \) different from \( n_0 \) and writes either \( P_1(x) = n_1 x + P_0(x) = n_1 x + n_0 \) or \( P_1(x) = n_1 x - P_0(x) = n_1 x - n_0 \). In both cases, \( P_1(x) \) is a linear polynomial. A linear polynomial \( ax + b \) has an integer root if and only if \( b/a \) is an integer. Therefore, Brenda can choose \( n_1 \) and \( n_0 \) such that \( n_1 n_0 \) is divisible by \( n_1 \), which is always possible since \( n_1 \) and \( n_0 \) are different positive integers. For example, if \( n_1 = 2 \) and \( n_0 = 2 \), then \( P_1(x) = 2x - 2 \) has the integer root \( x = 1 \). However, since \( n_1 \) must be different from \( n_0 \), we need to consider the next turn.
+
+3. On turn 2, Susana chooses a positive integer \( n_2 \) different from \( n_0 \) and \( n_1 \) and writes either \( P_2(x) = n_2 x^2 + P_1(x) \) or \( P_2(x) = n_2 x^2 - P_1(x) \). To ensure a win, Susana should choose \( n_2 \) such that the resulting polynomial has an integer root. Notice that if \( P_1(x) = n_1 x + n_0 \) or \( P_1(x) = n_1 x - n_0 \), then \( P_2(x) = n_2 x^2 + P_1(x) \) or \( P_2(x) = n_2 x^2 - P_1(x) \) will have an integer root if \( n_2 \) is chosen appropriately. For example, if \( P_1(x) = 2x - 2 \), then \( P_2(x) = n_2 x^2 + 2x - 2 \) or \( P_2(x) = n_2 x^2 - 2x + 2 \) will have an integer root if \( n_2 \) is chosen such that the polynomial can be factored to have an integer root.
+
+By following this strategy, Susana can always choose \( n_2 \) such that the resulting polynomial has an integer root. Therefore, Susana has a winning strategy.
+
+The winning strategy for Susana is to choose \( n_2 \) such that the resulting polynomial has an integer root. Specifically, if \( P_1(x) = n_1 x + n_0 \) or \( P_1(x) = n_1 x - n_0 \), then Susana should choose \( n_2 \) such that \( P_2(x) = n_2 x^2 + P_1(x) \) or \( P_2(x) = n_2 x^2 - P_1(x) \) has an integer root.
+
+The answer is: \(\boxed{\text{Susana has a winning strategy.}}\)
+
+
+
+
+## Student Final Answer
+\text{Susana has a winning strategy.}
+
+## Equivalence Judgement
+FALSE
+
+## Justification
+The student's answer states that "Susana has a winning strategy," which is not in agreement with the reference answer indicating "No winning strategy; the game can last indefinitely." The essence of the problem is that neither player can guarantee a win because the game can go on indefinitely without either player being able to write a polynomial with an integer root. The student's conclusion contradicts this, suggesting that a specific winning strategy exists for Susana, which is fundamentally incorrect according to the problem's conditions. Thus, the student's answer does not capture the same meaning as the reference answer.
+
+
+=== report over ===
+
+
+**Question**:
+Determine if there exists a (three-variable) polynomial $P(x,y,z)$ with integer coefficients satisfying the following property: a positive integer $n$ is [i]not[/i] a perfect square if and only if there is a triple $(x,y,z)$ of positive integers such that $P(x,y,z) = n$.
+
+**Reference Answer**
+P(x,y,z) = x^2 + y^2 + z^2 + 2xyz
+
+**Student Solution**:
+ **Claim:** There exists a three-variable polynomial $P(x, y
+
+
+
+## Student Final Answer
+None
+
+## Equivalence Judgement
+FALSE
+
+## Justification
+The student's answer does not provide a final conclusion and the final answer is missing, which indicates a lack of conclusion.
+
+=== report over ===
+
+
+**Question**:
+{{Problem}}
+
+**Reference Answer**
+{{Reference Answer}}
+
+**Student Solution**:
+{{Solution}}
\ No newline at end of file
diff --git a/src/helm/benchmark/annotation/omni_math_annotator.py b/src/helm/benchmark/annotation/omni_math_annotator.py
new file mode 100644
index 00000000000..6081c376bb0
--- /dev/null
+++ b/src/helm/benchmark/annotation/omni_math_annotator.py
@@ -0,0 +1,69 @@
+from typing import Any
+from importlib.resources import files
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import Annotator
+from helm.clients.auto_client import AutoClient
+from helm.common.request import Request
+
+
+# Following https://github.com/KbsdJames/Omni-MATH/blob/main/GPT_eval/get_result.py
+def parse_report(report):
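+    # Split the grader's markdown report into sections keyed by their "## " headings
+    # ("Student Final Answer", "Equivalence Judgement", "Justification").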
+ parts = report.split("## ")
+ data = {}
+ for part in parts[1:]:
+ lines = part.strip().split("\n")
+ title = lines[0].strip()
+ content = "\n".join(lines[1:]).strip()
+ if title == "Justification":
+ data[title] = content
+ else:
+ data[title] = lines[1].strip() if len(lines) > 1 else ""
+ return data
+
+
+class OmniMATHAnnotator(Annotator):
+ """The Omni-MATH autograder."""
+
+ name = "omni_math"
+
+ def __init__(self, auto_client: AutoClient):
+ self._auto_client = auto_client
+        template_path = files("helm.benchmark.annotation.omni_math").joinpath("gpt_evaluation_template.txt")
+ with template_path.open("r") as file:
+ self._score_template = file.read()
+
+ def annotate(self, request_state: RequestState) -> Any:
+ assert request_state.result
+ assert len(request_state.result.completions) == 1
+ prompt_template = self._score_template
+ model_output_text = request_state.result.completions[0].text
+ annotator_prompt = (
+ prompt_template.replace("{{Problem}}", request_state.instance.input.text)
+ .replace("{{Reference Answer}}", request_state.instance.references[0].output.text)
+ .replace("{{Solution}}", model_output_text)
+ )
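+        # An empty model output cannot be correct, so skip the GPT-4o grading call and score it as 0.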
+ if not model_output_text.strip():
+ return {"prompt_text": annotator_prompt, "correctness": 0.0}
+
+ annotator_request = Request(
+ model="openai/gpt-4o-2024-05-13",
+ model_deployment="openai/gpt-4o-2024-05-13",
+ prompt=annotator_prompt,
+ temperature=0.0,
+ max_tokens=1000,
+ )
+ annotator_response = self._auto_client.make_request(annotator_request)
+ if not annotator_response.success:
+ raise Exception(f"Annotation request failed: {annotator_response.error}")
+ assert len(annotator_response.completions) == 1
+ annotator_response_text = annotator_response.completions[0].text
+
+ info = parse_report(annotator_response_text)
+
+ correctness = info.get("Equivalence Judgement", "FALSE")
+
+ if correctness == "TRUE":
+ return {"prompt_text": annotator_prompt, "correctness": 1.0}
+ else:
+ return {"prompt_text": annotator_prompt, "correctness": 0.0}
diff --git a/src/helm/benchmark/annotation_executor.py b/src/helm/benchmark/annotation_executor.py
index 9bc30d534b2..c82143608bd 100644
--- a/src/helm/benchmark/annotation_executor.py
+++ b/src/helm/benchmark/annotation_executor.py
@@ -92,18 +92,28 @@ def execute(self, scenario_state: ScenarioState) -> ScenarioState:
hlog("No annotators to run.")
return scenario_state
- # Do it!
- def do_it(request_state: RequestState) -> RequestState:
- assert scenario_state.annotator_specs is not None
- return self.process(scenario_state.annotator_specs, request_state)
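+        # If every annotator scores requests globally (use_global_metric),
+        # annotate all request states in one batch.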
+ if all(
+ getattr(self.factory.get_annotator(spec), "use_global_metric", False)
+ for spec in scenario_state.annotator_specs
+ ):
+            # Do it!
+            request_states = self.process_all(scenario_state.annotator_specs, scenario_state.request_states)
- self.annotator_specs = scenario_state.annotator_specs
+ else:
+ # Do it!
+ def do_it(request_state: RequestState) -> RequestState:
+ assert scenario_state.annotator_specs is not None
+ return self.process(scenario_state.annotator_specs, request_state)
- request_states = parallel_map(
- do_it,
- scenario_state.request_states,
- parallelism=self.execution_spec.parallelism,
- )
+ self.annotator_specs = scenario_state.annotator_specs
+
+ request_states = parallel_map(
+ do_it,
+ scenario_state.request_states,
+ parallelism=self.execution_spec.parallelism,
+ )
hlog(f"Annotated {len(request_states)} requests")
return ScenarioState(
@@ -122,3 +132,14 @@ def process(self, annotator_specs: List[AnnotatorSpec], state: RequestState) ->
except Exception as e:
raise AnnotationExecutorError(f"{str(e)} Request: {state.request}") from e
return replace(state, annotations=annotations)
+
+ def process_all(self, annotator_specs: List[AnnotatorSpec], states: List[RequestState]) -> List[RequestState]:
+        try:
+            for annotator_spec in annotator_specs:
+                annotator: Annotator = self.factory.get_annotator(annotator_spec)
+                # annotate_all returns one annotations dict per request state, keyed by the annotator's name.
+                new_annotations = annotator.annotate_all(states)
+ except Exception as e:
+ raise AnnotationExecutorError(f"{str(e)} Request: {[state.request for state in states]}") from e
+ return [replace(state, annotations=new_annotations[idx]) for idx, state in enumerate(states)]
diff --git a/src/helm/benchmark/metrics/bigcodebench_metrics.py b/src/helm/benchmark/metrics/bigcodebench_metrics.py
new file mode 100644
index 00000000000..7d80a328f21
--- /dev/null
+++ b/src/helm/benchmark/metrics/bigcodebench_metrics.py
@@ -0,0 +1,25 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class BigCodeBenchMetric(Metric):
+ """Score metrics for BigCodeBench."""
+
+ def evaluate_generation(
+ self,
+ adapter_spec: AdapterSpec,
+ request_state: RequestState,
+ metric_service: MetricService,
+ eval_cache_path: str,
+ ) -> List[Stat]:
+ assert request_state.annotations
+ score = request_state.annotations["bigcodebench"]["pass_at_one"]
+ return [
+ Stat(MetricName("bigcodebench_p@1")).add(score),
+ ]
diff --git a/src/helm/benchmark/metrics/omni_math_metrics.py b/src/helm/benchmark/metrics/omni_math_metrics.py
new file mode 100644
index 00000000000..c63c9f4020b
--- /dev/null
+++ b/src/helm/benchmark/metrics/omni_math_metrics.py
@@ -0,0 +1,25 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class OmniMATHMetric(Metric):
+ """Score metrics for Omni-MATH."""
+
+ def evaluate_generation(
+ self,
+ adapter_spec: AdapterSpec,
+ request_state: RequestState,
+ metric_service: MetricService,
+ eval_cache_path: str,
+ ) -> List[Stat]:
+ assert request_state.annotations
+ score = request_state.annotations["omni_math"]["correctness"]
+ return [
+ Stat(MetricName("omni_math_accuracy")).add(score),
+ ]
diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py
index 6dc83be0f67..b3be2e55e72 100644
--- a/src/helm/benchmark/run_specs/lite_run_specs.py
+++ b/src/helm/benchmark/run_specs/lite_run_specs.py
@@ -478,3 +478,64 @@ def get_wildbench_spec(subset: str, use_model_outputs: str = "False") -> RunSpec
metric_specs=metric_specs,
groups=["wildbench"],
)
+
+
+@run_spec_function("bigcodebench")
+def get_bigcodebench_spec(version: str) -> RunSpec:
+
+ scenario_spec = ScenarioSpec(
+ class_name="helm.benchmark.scenarios.bigcodebench_scenario.BigCodeBenchScenario", args={"version": version}
+ )
+
+ # Adapted from https://github.dev/bigcode-project/bigcodebench/blob/main/bigcodebench/evaluate.py
+ adapter_spec = AdapterSpec(
+ method=ADAPT_GENERATION,
+ input_prefix="",
+ output_prefix="",
+ max_tokens=1280,
+ num_outputs=1,
+ temperature=0.0,
+ global_prefix="Please provide a self-contained Python script "
+ "that solves the following problem in a markdown code block:",
+ )
+ annotator_specs = [
+ AnnotatorSpec(class_name="helm.benchmark.annotation.bigcodebench_annotator.BigCodeBenchAnnotator")
+ ]
+ metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.bigcodebench_metrics.BigCodeBenchMetric")]
+
+ return RunSpec(
+        name=f"bigcodebench:version={version}",
+ scenario_spec=scenario_spec,
+ adapter_spec=adapter_spec,
+ annotators=annotator_specs,
+ metric_specs=metric_specs,
+ groups=["bigcodebench"],
+ )
+
+
+@run_spec_function("omni_math")
+def get_omni_math_spec() -> RunSpec:
+
+ scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.omni_math_scenario.OmniMATHScenario")
+
+ adapter_spec = AdapterSpec(
+ method=ADAPT_GENERATION,
+ input_prefix="",
+ output_prefix="",
+ max_tokens=1000,
+ num_outputs=1,
+ temperature=0.0,
+ )
+ annotator_specs = [AnnotatorSpec(class_name="helm.benchmark.annotation.omni_math_annotator.OmniMATHAnnotator")]
+ metric_specs = get_basic_metric_specs([]) + [
+ MetricSpec(class_name="helm.benchmark.metrics.omni_math_metrics.OmniMATHMetric")
+ ]
+
+ return RunSpec(
+ name="omni_math",
+ scenario_spec=scenario_spec,
+ adapter_spec=adapter_spec,
+ annotators=annotator_specs,
+ metric_specs=metric_specs,
+ groups=["omni_math"],
+ )
diff --git a/src/helm/benchmark/scenarios/bigcodebench_scenario.py b/src/helm/benchmark/scenarios/bigcodebench_scenario.py
new file mode 100644
index 00000000000..19b8ceb699a
--- /dev/null
+++ b/src/helm/benchmark/scenarios/bigcodebench_scenario.py
@@ -0,0 +1,59 @@
+import datasets
+import os
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+ Scenario,
+ Instance,
+ TEST_SPLIT,
+ Input,
+)
+from helm.common.general import ensure_directory_exists
+
+
+VERSIONS = ["v0.1.2"]
+
+
+class BigCodeBenchScenario(Scenario):
+ """BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions
+
+ BigCodeBench is an easy-to-use benchmark for solving practical and challenging tasks via code.
+ It aims to evaluate the true programming capabilities of large language models (LLMs) in a more realistic setting.
+ The benchmark is designed for HumanEval-like function-level code generation tasks,
+ but with much more complex instructions and diverse function calls."""
+
+ name = "bigcodebench"
+ description = "Benchmarking Code Generation with Diverse Function Calls and Complex Instructions"
+ tags = ["coding"]
+
+ def __init__(self, version: str):
+ super().__init__()
+ assert version in VERSIONS, "Unknown version: {}".format(version)
+ self.version = version
+
+ def get_instances(self, output_path: str) -> List[Instance]:
+ # Get BigCodeBench from HuggingFace
+ cache_dir = os.path.join(output_path, "data")
+ ensure_directory_exists(cache_dir)
+ dataset = datasets.load_dataset(
+ "bigcode/bigcodebench",
+ revision="35a015f216382cb88997b91b9400357a79e55141", # for v0.1.2
+ cache_dir=cache_dir,
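+            # Each BigCodeBench release is published as its own dataset split (e.g. "v0.1.2").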
+ split=self.version,
+ )
+ assert isinstance(dataset, datasets.Dataset)
+
+ # Read all instances
+ instances: List[Instance] = []
+        for row in dataset:
+ input = Input(text=row["instruct_prompt"])
+ instance = Instance(
+ input=input,
+ references=[],
+ split=TEST_SPLIT,
+ id=row["task_id"],
+ extra_data={"task_id": row["task_id"]},
+ )
+ instances.append(instance)
+
+ return instances
diff --git a/src/helm/benchmark/scenarios/omni_math_scenario.py b/src/helm/benchmark/scenarios/omni_math_scenario.py
new file mode 100644
index 00000000000..64ced56c25d
--- /dev/null
+++ b/src/helm/benchmark/scenarios/omni_math_scenario.py
@@ -0,0 +1,53 @@
+import datasets
+import os
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+ Scenario,
+ Instance,
+ Reference,
+ TEST_SPLIT,
+ Input,
+ Output,
+ CORRECT_TAG,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class OmniMATHScenario(Scenario):
+ """Omni-MATH: A Universal Olympiad Level Mathematic Benchmark for Large Language Models
+
+ Omni-MATH is a comprehensive and challenging benchmark specifically designed to assess LLMs' mathematical
+    reasoning at the Olympiad level. The dataset focuses exclusively on Olympiad mathematics and comprises a
+    vast collection of 4428 competition-level problems. These problems are meticulously categorized into 33
+    (and potentially more) sub-domains and span across 10 distinct difficulty levels, enabling a nuanced
+    analysis of model performance across various mathematical disciplines and levels of complexity."""
+
+ name = "omni_math"
+ description = "A Universal Olympiad Level Mathematic Benchmark for Large Language Models"
+ tags = ["math"]
+
+ def get_instances(self, output_path: str) -> List[Instance]:
+ # Get Omni-MATH from HuggingFace
+ cache_dir = os.path.join(output_path, "data")
+ ensure_directory_exists(cache_dir)
+ dataset = datasets.load_dataset(
+ "KbsdJames/Omni-MATH",
+ revision="40ba231d8f16e29ecd40e6407e2c8640145a8f62",
+ cache_dir=cache_dir,
+ split="test",
+ )
+ assert isinstance(dataset, datasets.Dataset)
+
+ # Read all instances
+ instances: List[Instance] = []
+        for row in dataset:
+ input = Input(text=row["problem"])
+ instance = Instance(
+ input=input,
+ references=[Reference(Output(text=row["answer"]), tags=[CORRECT_TAG])],
+ split=TEST_SPLIT,
+ )
+ instances.append(instance)
+
+ return instances
diff --git a/src/helm/benchmark/scenarios/test_bigcodebench_scenario.py b/src/helm/benchmark/scenarios/test_bigcodebench_scenario.py
new file mode 100644
index 00000000000..1a3a8197bee
--- /dev/null
+++ b/src/helm/benchmark/scenarios/test_bigcodebench_scenario.py
@@ -0,0 +1,27 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.bigcodebench_scenario import BigCodeBenchScenario
+from helm.benchmark.scenarios.scenario import Input, TEST_SPLIT
+
+
+@pytest.mark.scenarios
+def test_bigcodebench_scenario_get_instances():
+ bigcodebench_scenario = BigCodeBenchScenario("v0.1.2")
+ with TemporaryDirectory() as tmpdir:
+ instances = bigcodebench_scenario.get_instances(tmpdir)
+ assert len(instances) == 1140
+ assert instances[0].input == Input(
+ text=(
+ "Calculates the average of the sums of absolute differences between each pair "
+ "of consecutive numbers for all permutations of a given list. Each permutation "
+ "is shuffled before calculating the differences. Args: - numbers (list): A list "
+ "of numbers. Default is numbers from 1 to 10.\nThe function should output with:\n"
+ " float: The average of the sums of absolute differences for each shuffled permutation "
+ "of the list.\nYou should write self-contained code starting with:\n```\nimport itertools\n"
+ "from random import shuffle\ndef task_func(numbers=list(range(1, 3))):\n```"
+ )
+ )
+ assert instances[0].split == TEST_SPLIT
+ assert instances[0].extra_data
+ assert instances[0].extra_data["task_id"] == "BigCodeBench/0"
diff --git a/src/helm/benchmark/scenarios/test_omni_math_scenario.py b/src/helm/benchmark/scenarios/test_omni_math_scenario.py
new file mode 100644
index 00000000000..915086d14c8
--- /dev/null
+++ b/src/helm/benchmark/scenarios/test_omni_math_scenario.py
@@ -0,0 +1,27 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.omni_math_scenario import OmniMATHScenario
+from helm.benchmark.scenarios.scenario import Input, TEST_SPLIT
+
+
+@pytest.mark.scenarios
+def test_omni_math_scenario_get_instances():
+ omni_math_scenario = OmniMATHScenario()
+ with TemporaryDirectory() as tmpdir:
+ instances = omni_math_scenario.get_instances(tmpdir)
+ assert len(instances) == 4428
+ assert instances[0].input == Input(
+ text=(
+ "Let $ n(\\ge2) $ be a positive integer. Find the minimum $ m $, "
+ "so that there exists $x_{ij}(1\\le i ,j\\le n)$ satisfying:\n(1)For every "
+ "$1\\le i ,j\\le n, x_{ij}=max\\{x_{i1},x_{i2},...,x_{ij}\\} $ or $ x_{ij}="
+ "max\\{x_{1j},x_{2j},...,x_{ij}\\}.$\n(2)For every $1\\le i \\le n$, there "
+ "are at most $m$ indices $k$ with $x_{ik}=max\\{x_{i1},x_{i2},...,x_{ik}\\}."
+ "$\n(3)For every $1\\le j \\le n$, there are at most $m$ indices $k$ with "
+ "$x_{kj}=max\\{x_{1j},x_{2j},...,x_{kj}\\}.$"
+ )
+ )
+ assert instances[0].split == TEST_SPLIT
+ assert instances[0].references
+ assert instances[0].references[0].output.text == "1 + \\left\\lceil \\frac{n}{2} \\right\\rceil"
diff --git a/src/helm/benchmark/static/schema_lite_v2.yaml b/src/helm/benchmark/static/schema_lite_v2.yaml
index 0252a519f42..f94518eb379 100644
--- a/src/helm/benchmark/static/schema_lite_v2.yaml
+++ b/src/helm/benchmark/static/schema_lite_v2.yaml
@@ -103,6 +103,16 @@ metrics:
short_display_name: WB Score
description: Score of the AI output judged by GPT-4o.
lower_is_better: false
+ - name: bigcodebench_p@1
+ display_name: BigCodeBench Pass@1
+ short_display_name: Pass@1
+    description: Fraction of AI outputs that pass the official BigCodeBench evaluator's tests (pass@1).
+ lower_is_better: false
+ - name: omni_math_accuracy
+ display_name: Omni-MATH Accuracy
+ short_display_name: Acc
+    description: Accuracy of the AI output judged by GPT-4o.
+ lower_is_better: false
############################################################
perturbations: []
@@ -147,6 +157,8 @@ run_groups:
- gpqa
- ifeval
- wildbench
+ - bigcodebench
+ - omni_math
- name: mmlu_pro
display_name: MMLU-Pro
@@ -215,3 +227,37 @@ run_groups:
who: "real-world users"
when: "2024"
language: English
+
+ - name: bigcodebench
+ display_name: BigCodeBench
+    description: BigCodeBench is a benchmark for practical and challenging function-level code generation tasks with complex instructions and diverse function calls.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: bigcodebench_p@1
+ main_split: test
+ taxonomy:
+ task: "code generation"
+ what: "function-level code generation tasks with complex instructions and diverse function calls"
+ who: "human annotators"
+ when: "2024"
+ language: English
+
+ - name: omni_math
+ display_name: Omni-MATH
+    description: Omni-MATH is a universal Olympiad-level mathematics benchmark for large language models, comprising 4428 competition-level problems.
+ metric_groups:
+ - accuracy
+ - efficiency
+ - general_information
+ environment:
+ main_name: omni_math_accuracy
+ main_split: test
+ taxonomy:
+ task: "mathematics"
+      what: "Olympiad-level mathematics competition problems"
+ who: "human annotators"
+ when: "2024"
+ language: English