diff --git a/gpt_engineer/benchmark/__init__.py b/gpt_engineer/benchmark/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/gpt_engineer/benchmark/__main__.py b/gpt_engineer/benchmark/__main__.py
index e58ca5dbd2..d8c1f58732 100644
--- a/gpt_engineer/benchmark/__main__.py
+++ b/gpt_engineer/benchmark/__main__.py
@@ -20,6 +20,7 @@
 The standard boilerplate for invoking the main function when the script is executed.
 """
 import importlib
+import os.path
 
 from typing import Annotated, Optional
 
@@ -29,6 +30,7 @@
 from langchain.globals import set_llm_cache
 
 from gpt_engineer.applications.cli.main import load_env_if_needed
+from gpt_engineer.benchmark.bench_config import BenchConfig
 from gpt_engineer.benchmark.benchmarks.load import get_benchmark
 from gpt_engineer.benchmark.run import print_results, run
 
@@ -69,12 +71,9 @@ def main(
             help="python file that contains a function called 'default_config_agent'"
         ),
     ],
-    benchmarks: Annotated[
-        str, typer.Argument(help="benchmark name(s) separated by ','")
-    ],
-    task_name: Annotated[
+    bench_config: Annotated[
         Optional[str], typer.Argument(help="optional task name in benchmark")
-    ] = None,
+    ] = os.path.join(os.path.dirname(__file__), "default_bench_config.toml"),
     verbose: Annotated[
         bool, typer.Option(help="print results for each task", show_default=False)
     ] = False,
@@ -88,8 +87,8 @@ def main(
         The file path to the Python module that contains a function called 'default_config_agent'.
     benchmarks : str
         A comma-separated string of benchmark names to run.
-    task_name : Optional[str], default=None
-        An optional task name to run within the benchmark.
+    bench_config : Optional[str], default=default_bench_config.toml
+        Configuration file for choosing which benchmark problems to run. See default config for more details.
     verbose : bool, default=False
         A flag to indicate whether to print results for each task.
 
@@ -99,13 +98,27 @@
     """
     set_llm_cache(SQLiteCache(database_path=".langchain.db"))
     load_env_if_needed()
+    config = BenchConfig.from_toml(bench_config)
+    print("using config file: " + bench_config)
+    benchmarks = list()
+    for specific_config_name in vars(config):
+        specific_config = getattr(config, specific_config_name)
+        if hasattr(specific_config, "active"):
+            if specific_config.active:
+                benchmarks.append(specific_config_name)
 
-    benchmarks = benchmarks.split(",")
     for benchmark_name in benchmarks:
-        benchmark = get_benchmark(benchmark_name)
+        benchmark = get_benchmark(benchmark_name, config)
+        if len(benchmark.tasks) == 0:
+            print(
+                benchmark_name
+                + " was skipped, since no tasks are specified. \nIncrease the number of tasks in the config file at: "
+                + bench_config
+            )
+            continue
         agent = get_agent(path_to_agent)
-        results = run(agent, benchmark, task_name, verbose=verbose)
+        results = run(agent, benchmark, verbose=verbose)
         print(
             f"\n--- Results for agent {path_to_agent}, benchmark: {benchmark_name} ---"
         )
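Illustrative sketch (outside the patch): how the rewritten main() now derives the benchmark list from a config file instead of a comma-separated argument. The TOML path below is the default shipped by this change; any path passed as the bench_config argument behaves the same way.

    from gpt_engineer.benchmark.bench_config import BenchConfig

    # Load the benchmark configuration (defaults to default_bench_config.toml next to __main__.py).
    config = BenchConfig.from_toml("gpt_engineer/benchmark/default_bench_config.toml")

    # Same selection rule as the loop added above: run every sub-config with active = true.
    benchmarks = [
        name for name in vars(config) if getattr(getattr(config, name), "active", False)
    ]
    print(benchmarks)  # with the default config: ['apps', 'mbpp', 'gptme', 'gpteng']
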
diff --git a/gpt_engineer/benchmark/bench_config.py b/gpt_engineer/benchmark/bench_config.py
new file mode 100644
index 0000000000..aafc38f524
--- /dev/null
+++ b/gpt_engineer/benchmark/bench_config.py
@@ -0,0 +1,56 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from gpt_engineer.core.project_config import read_config
+
+
+@dataclass
+class AppsConfig:
+    active: bool | None = True
+    test_start_index: int | None = 0
+    test_end_index: int | None = 1
+    train_start_index: int | None = 0
+    train_end_index: int | None = 0
+
+
+@dataclass
+class MbppConfig:
+    active: bool | None = True
+    test_len: int | None = 1
+    train_len: int | None = 0
+
+
+@dataclass
+class GptmeConfig:
+    active: bool | None = True
+
+
+@dataclass
+class GptengConfig:
+    active: bool | None = True
+
+
+@dataclass
+class BenchConfig:
+    """Configuration for the GPT Engineer CLI and gptengineer.app via `gpt-engineer.toml`."""
+
+    apps: AppsConfig = field(default_factory=AppsConfig)
+    mbpp: MbppConfig = field(default_factory=MbppConfig)
+    gptme: GptmeConfig = field(default_factory=GptmeConfig)
+    gpteng: GptengConfig = field(default_factory=GptengConfig)
+
+    @classmethod
+    def from_toml(cls, config_file: Path | str):
+        if isinstance(config_file, str):
+            config_file = Path(config_file)
+        config_dict = read_config(config_file)
+        return cls.from_dict(config_dict)
+
+    @classmethod
+    def from_dict(cls, config_dict: dict):
+        return cls(
+            apps=AppsConfig(**config_dict.get("apps", {})),
+            mbpp=MbppConfig(**config_dict.get("mbpp", {})),
+            gptme=GptmeConfig(**config_dict.get("gptme", {})),
+            gpteng=GptengConfig(**config_dict.get("gpteng", {})),
+        )
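Illustrative usage of the dataclasses above (outside the patch): from_dict only overrides the keys that are present, so a partial TOML section falls back to the dataclass defaults, and the apps index windows are half-open (rows with start_index <= index < end_index are kept).

    from gpt_engineer.benchmark.bench_config import BenchConfig

    config = BenchConfig.from_dict({"mbpp": {"test_len": 5}})
    assert config.mbpp.test_len == 5    # overridden
    assert config.mbpp.train_len == 0   # MbppConfig default
    assert config.apps.active is True   # missing sections keep all defaults

    # Half-open window over the APPS test split: the defaults keep only row 0.
    apps = config.apps
    kept = [i for i in range(5) if apps.test_start_index <= i < apps.test_end_index]
    assert kept == [0]
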
diff --git a/gpt_engineer/benchmark/benchmarks/apps/load.py b/gpt_engineer/benchmark/benchmarks/apps/load.py
index 65cf515713..0929aa447f 100644
--- a/gpt_engineer/benchmark/benchmarks/apps/load.py
+++ b/gpt_engineer/benchmark/benchmarks/apps/load.py
@@ -16,8 +16,8 @@
 from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
 
+from gpt_engineer.benchmark.bench_config import AppsConfig
 from gpt_engineer.benchmark.benchmarks.apps.problem import Problem
-from gpt_engineer.benchmark.benchmarks.apps.problems import PROBLEM_IDS
 from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
 from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
 from gpt_engineer.core.files_dict import FilesDict
@@ -57,12 +57,12 @@ def _get_dataset() -> Union[Dataset, DatasetDict]:
         print("Dataset not found locally, downloading...")
 
     dataset = load_dataset("codeparrot/apps", trust_remote_code=True)
-    dataset.save_to_disk(DATASET_PATH)
+    dataset.save_to_disk(str(DATASET_PATH))
 
     return dataset
 
 
-def load_apps():
+def load_apps(config: AppsConfig) -> Benchmark:
     """
     Loads the APPS benchmark, which consists of a series coding problems.
 
@@ -73,17 +73,19 @@ def load_apps():
     """
     dataset = _get_dataset()
     tasks = []
-
-    problems = [
-        Problem(
-            id=problem["problem_id"],
-            question=problem["question"],
-            input_output=problem["input_output"],
-            starter_code=problem["starter_code"],
-        )
-        for problem in dataset["test"]
-        if problem["problem_id"] in PROBLEM_IDS
-    ]
+    problems = list()
+    for dataset_type in ["test", "train"]:
+        problems += [
+            Problem(
+                id=problem["problem_id"],
+                question=problem["question"],
+                input_output=problem["input_output"],
+                starter_code=problem["starter_code"],
+            )
+            for index, problem in enumerate(dataset[dataset_type])
+            if (index < config.__getattribute__(dataset_type + "_end_index"))
+            and (index >= config.__getattribute__(dataset_type + "_start_index"))
+        ]
 
     for problem in problems:
         prompt = Prompt(
@@ -110,6 +112,6 @@ def load_apps():
     )
 
     return Benchmark(
-        name="APPS",
+        name="apps",
         tasks=tasks,
     )
diff --git a/gpt_engineer/benchmark/benchmarks/gpteng/load.py b/gpt_engineer/benchmark/benchmarks/gpteng/load.py
index bf575afeef..c9c115234f 100644
--- a/gpt_engineer/benchmark/benchmarks/gpteng/load.py
+++ b/gpt_engineer/benchmark/benchmarks/gpteng/load.py
@@ -19,11 +19,13 @@
 from pathlib import Path
 
+from gpt_engineer.benchmark.bench_config import GptengConfig
 from gpt_engineer.benchmark.benchmarks.gpteng.eval_tools import (
     check_evaluation_component,
 )
 from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
 from gpt_engineer.core.chat_to_files import chat_to_files_dict
+from gpt_engineer.core.prompt import Prompt
 
 
 evaluations = [
     {
@@ -192,7 +194,7 @@ def eval_to_task(case):
     return Task(
         name=case["name"],
         initial_code=chat_to_files_dict(Path(case["code_blob"]).read_text()),
-        prompt=prompt,
+        prompt=Prompt(prompt),
         command=None,
         assertions={
             f"{e['type']}_{i}": expect_to_assertion(e)
@@ -201,7 +203,7 @@ def eval_to_task(case):
     )
 
 
-def load_gpteng():
+def load_gpteng(config: GptengConfig) -> Benchmark:
     """
     Loads the GPT-Eng benchmark, which consists of a series of tasks for evaluation.
 
diff --git a/gpt_engineer/benchmark/benchmarks/gptme/load.py b/gpt_engineer/benchmark/benchmarks/gptme/load.py
index 216c7c44db..a611ebb531 100644
--- a/gpt_engineer/benchmark/benchmarks/gptme/load.py
+++ b/gpt_engineer/benchmark/benchmarks/gptme/load.py
@@ -10,12 +10,13 @@
 load_gptme : function
     Loads the GPT-Me benchmark, which consists of a series of tasks for evaluation.
 """
+from gpt_engineer.benchmark.bench_config import GptmeConfig
 from gpt_engineer.benchmark.types import Benchmark, Task
 from gpt_engineer.core.files_dict import FilesDict
 from gpt_engineer.core.prompt import Prompt
 
 
-def load_gptme():
+def load_gptme(config: GptmeConfig) -> Benchmark:
     """
     Loads the GPT-Me benchmark, which consists of a series of tasks for evaluation.
 
diff --git a/gpt_engineer/benchmark/benchmarks/load.py b/gpt_engineer/benchmark/benchmarks/load.py
index 6ed8659a26..e065875edf 100644
--- a/gpt_engineer/benchmark/benchmarks/load.py
+++ b/gpt_engineer/benchmark/benchmarks/load.py
@@ -9,6 +9,7 @@
 get_benchmark : function
     Retrieves a Benchmark object by name. Raises ValueError if the benchmark is unknown.
 """
+from gpt_engineer.benchmark.bench_config import BenchConfig
 from gpt_engineer.benchmark.benchmarks.apps.load import load_apps
 from gpt_engineer.benchmark.benchmarks.gpteng.load import load_gpteng
 from gpt_engineer.benchmark.benchmarks.gptme.load import load_gptme
@@ -23,7 +24,7 @@
 }
 
 
-def get_benchmark(name: str) -> Benchmark:
+def get_benchmark(name: str, config: BenchConfig) -> Benchmark:
     """
     Retrieves a Benchmark object by name. Raises ValueError if the benchmark is unknown.
 
@@ -31,6 +32,8 @@ def get_benchmark(name: str) -> Benchmark:
     ----------
     name : str
         The name of the benchmark to retrieve.
+    config : BenchConfig
+        Configuration object for the benchmarks.
 
     Returns
     -------
@@ -44,4 +47,4 @@ def get_benchmark(name: str) -> Benchmark:
     """
     if name not in BENCHMARKS:
         raise ValueError(f"Unknown benchmark {name}.")
-    return BENCHMARKS[name]()
+    return BENCHMARKS[name](config.__getattribute__(name))
diff --git a/gpt_engineer/benchmark/benchmarks/mbpp/load.py b/gpt_engineer/benchmark/benchmarks/mbpp/load.py
index 9aefef0d92..5f815be823 100644
--- a/gpt_engineer/benchmark/benchmarks/mbpp/load.py
+++ b/gpt_engineer/benchmark/benchmarks/mbpp/load.py
@@ -16,8 +16,8 @@
 from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
 
+from gpt_engineer.benchmark.bench_config import MbppConfig
 from gpt_engineer.benchmark.benchmarks.mbpp.problem import Problem
-from gpt_engineer.benchmark.benchmarks.mbpp.problems import PROBLEM_IDS
 from gpt_engineer.benchmark.types import Assertable, Benchmark, Task
 from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv
 from gpt_engineer.core.files_dict import FilesDict
@@ -57,12 +57,12 @@ def _get_dataset() -> Union[Dataset, DatasetDict]:
         print("Dataset not found locally, downloading...")
 
     dataset = load_dataset("mbpp", "sanitized", trust_remote_code=True)
-    dataset.save_to_disk(DATASET_PATH)
+    dataset.save_to_disk(str(DATASET_PATH))
 
     return dataset
 
 
-def load_mbpp():
+def load_mbpp(config: MbppConfig) -> Benchmark:
     """
     Loads the MBPP benchmark, which consists of a series coding problems.
 
@@ -73,19 +73,20 @@ def load_mbpp():
     """
     dataset = _get_dataset()
     tasks = []
-
-    problems = [
-        Problem(
-            source_file=problem["source_file"],
-            task_id=problem["task_id"],
-            prompt=problem["prompt"],
-            code=problem["code"],
-            test_imports=problem["test_imports"],
-            test_list=problem["test_list"],
-        )
-        for problem in dataset["test"]
-        if problem["task_id"] in PROBLEM_IDS
-    ]
+    problems = []
+    for dataset_type in ["test", "train"]:
+        problems += [
+            Problem(
+                source_file=problem["source_file"],
+                task_id=problem["task_id"],
+                prompt=problem["prompt"],
+                code=problem["code"],
+                test_imports=problem["test_imports"],
+                test_list=problem["test_list"],
+            )
+            for index, problem in enumerate(dataset[dataset_type])
+            if index < config.__getattribute__(dataset_type + "_len")
+        ]
 
     for problem in problems:
         prompt = Prompt(
@@ -109,6 +110,6 @@ def load_mbpp():
     )
 
     return Benchmark(
-        name="MBPP",
+        name="mbpp",
         tasks=tasks,
     )
diff --git a/gpt_engineer/benchmark/default_bench_config.toml b/gpt_engineer/benchmark/default_bench_config.toml
new file mode 100644
index 0000000000..5da0079454
--- /dev/null
+++ b/gpt_engineer/benchmark/default_bench_config.toml
@@ -0,0 +1,19 @@
+# For apps, the maximal range is 0:5000 for both train and test
+[apps]
+active = true
+test_start_index = 0
+test_end_index = 2
+train_start_index = 0
+train_end_index = 2
+
+# For mbpp, the maximal range is 0:47
+[mbpp]
+active = true
+test_len = 2
+train_len = 2
+
+[gpteng]
+active = true
+
+[gptme]
+active = true
diff --git a/gpt_engineer/benchmark/run.py b/gpt_engineer/benchmark/run.py
index 6373f89a06..c2de03950f 100644
--- a/gpt_engineer/benchmark/run.py
+++ b/gpt_engineer/benchmark/run.py
@@ -14,7 +14,7 @@
 """
 import time
 
-from typing import List, Optional
+from typing import List
 
 from gpt_engineer.benchmark.types import Assertable, Benchmark, TaskResult
 from gpt_engineer.core.base_agent import BaseAgent
@@ -24,7 +24,6 @@
 def run(
     agent: BaseAgent,
     benchmark: Benchmark,
-    task_name: Optional[str] = None,
     verbose=False,
 ) -> List[TaskResult]:
     """
@@ -36,8 +35,6 @@ def run(
         The agent to use for running the benchmark tasks.
     benchmark : Benchmark
         The benchmark containing the tasks to run.
-    task_name : Optional[str], default=None
-        An optional name of a specific task to run within the benchmark.
     verbose : bool, default=False
         A flag to indicate whether to print verbose output during the benchmark.
 
diff --git a/pyproject.toml b/pyproject.toml
index f3bf50b5a6..0011553a80 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -96,3 +96,8 @@ section-order = [
 combine-as-imports = true
 split-on-trailing-comma = false
 lines-between-types = 1
+
+[tool.pytest.ini_options]
+markers = [
+    "requires_key: marks tests as requiring access to a valid OPENAI_API_KEY (deselect with '-m \"not requires_key\"')",
+]
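The requires_key marker registered in pyproject.toml above is meant for tests that need a real OPENAI_API_KEY. A hypothetical test using it (outside the patch) would look like the snippet below; such tests can then be deselected with -m "not requires_key", as noted in the marker description.

    import pytest

    @pytest.mark.requires_key
    def test_agent_against_live_api():
        ...  # placeholder body for a test that actually calls the OpenAI API
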
diff --git a/tests/benchmark/test_BenchConfig.py b/tests/benchmark/test_BenchConfig.py
new file mode 100644
index 0000000000..87619ae665
--- /dev/null
+++ b/tests/benchmark/test_BenchConfig.py
@@ -0,0 +1,96 @@
+# Generated by CodiumAI
+
+import pytest
+
+from gpt_engineer.benchmark.bench_config import (
+    AppsConfig,
+    BenchConfig,
+    GptengConfig,
+    GptmeConfig,
+    MbppConfig,
+)
+
+
+class TestBenchConfig:
+    # Creating a BenchConfig object with default values should return an instance of BenchConfig with all attributes set to their default values.
+    def test_default_values(self):
+        config = BenchConfig()
+        assert isinstance(config.apps, AppsConfig)
+        assert isinstance(config.mbpp, MbppConfig)
+        assert isinstance(config.gptme, GptmeConfig)
+        assert isinstance(config.gpteng, GptengConfig)
+        assert config.apps.active is True
+        assert config.apps.test_start_index == 0
+        assert config.apps.test_end_index == 1
+        assert config.apps.train_start_index == 0
+        assert config.apps.train_end_index == 0
+        assert config.mbpp.active is True
+        assert config.mbpp.test_len == 1
+        assert config.mbpp.train_len == 0
+        assert config.gptme.active is True
+        assert config.gpteng.active is True
+
+    # Creating a BenchConfig object with specific values should return an instance of BenchConfig with the specified attributes set to the specified values.
+    def test_specific_values(self):
+        config = BenchConfig(
+            apps=AppsConfig(
+                active=False,
+                test_start_index=1,
+                test_end_index=2,
+                train_start_index=3,
+                train_end_index=4,
+            ),
+            mbpp=MbppConfig(active=False, test_len=5, train_len=6),
+            gptme=GptmeConfig(active=False),
+            gpteng=GptengConfig(active=False),
+        )
+        assert isinstance(config.apps, AppsConfig)
+        assert isinstance(config.mbpp, MbppConfig)
+        assert isinstance(config.gptme, GptmeConfig)
+        assert isinstance(config.gpteng, GptengConfig)
+        assert config.apps.active is False
+        assert config.apps.test_start_index == 1
+        assert config.apps.test_end_index == 2
+        assert config.apps.train_start_index == 3
+        assert config.apps.train_end_index == 4
+        assert config.mbpp.active is False
+        assert config.mbpp.test_len == 5
+        assert config.mbpp.train_len == 6
+        assert config.gptme.active is False
+        assert config.gpteng.active is False
+
+    # Calling the from_dict method with a valid dictionary should return an instance of BenchConfig with attributes set according to the values in the dictionary.
+    def test_from_dict_valid_dict(self):
+        config_dict = {
+            "apps": {
+                "active": False,
+                "test_start_index": 1,
+                "test_end_index": 2,
+                "train_start_index": 3,
+                "train_end_index": 4,
+            },
+            "mbpp": {"active": False, "test_len": 5, "train_len": 6},
+            "gptme": {"active": False},
+            "gpteng": {"active": False},
+        }
+        config = BenchConfig.from_dict(config_dict)
+        assert isinstance(config.apps, AppsConfig)
+        assert isinstance(config.mbpp, MbppConfig)
+        assert isinstance(config.gptme, GptmeConfig)
+        assert isinstance(config.gpteng, GptengConfig)
+        assert config.apps.active is False
+        assert config.apps.test_start_index == 1
+        assert config.apps.test_end_index == 2
+        assert config.apps.train_start_index == 3
+        assert config.apps.train_end_index == 4
+        assert config.mbpp.active is False
+        assert config.mbpp.test_len == 5
+        assert config.mbpp.train_len == 6
+        assert config.gptme.active is False
+        assert config.gpteng.active is False
+
+    # Calling the from_toml method with an invalid path to a TOML file should raise an appropriate exception.
+    def test_from_toml_invalid_path(self):
+        config_file = "invalid_config.toml"
+        with pytest.raises(Exception):
+            BenchConfig.from_toml(config_file)
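Putting the pieces together, a minimal end-to-end sketch of the new flow (outside the patch; the agent module path is a placeholder, and get_agent / print_results are the helpers already present in gpt_engineer/benchmark/__main__.py and run.py):

    from gpt_engineer.benchmark.__main__ import get_agent
    from gpt_engineer.benchmark.bench_config import BenchConfig
    from gpt_engineer.benchmark.benchmarks.load import get_benchmark
    from gpt_engineer.benchmark.run import print_results, run

    config = BenchConfig.from_toml("gpt_engineer/benchmark/default_bench_config.toml")
    agent = get_agent("path/to/my_agent_module.py")  # module must expose default_config_agent

    for name in (n for n in vars(config) if getattr(config, n).active):
        benchmark = get_benchmark(name, config)
        if not benchmark.tasks:
            continue  # same skip rule as main() when the config selects zero tasks
        print_results(run(agent, benchmark, verbose=False))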