From a131a6e799f3180e091805fc0debe5f637aacc5c Mon Sep 17 00:00:00 2001
From: Ben Feuer
Date: Fri, 20 Dec 2024 09:21:18 -0500
Subject: [PATCH 1/3] Update pyproject.toml

---
 pyproject.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 824c115..da53e69 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,7 @@ dependencies = [
     "pandas",
     "scipy",
     "scikit-learn",
-    "faiss-cpu==1.7.4",
+    "faiss-cpu>=1.7.4",
 
     # ML Experiment Tracking & Visualization
     "wandb",
@@ -153,7 +153,6 @@ dev = [
 ]
 eval = [
     # Using exact package names from their setup files
-    "fschat @ file:eval/chat_benchmarks/MTBench", # Changed to match the likely package name
     "lm-eval[vllm]@git+https://github.com/EtashGuha/lm-evaluation-harness@etashg/tokenize_fix",
 ]

From 42a128f3edfe96ce72705b88fcfaa0172a92d08f Mon Sep 17 00:00:00 2001
From: Ben Feuer
Date: Fri, 20 Dec 2024 09:21:44 -0500
Subject: [PATCH 2/3] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 443c70f..67c3d22 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ conda activate evalchemy
 # Install dependencies
 pip install -e ".[eval]"
 pip install -e eval/chat_benchmarks/alpaca_eval
+pip install -e eval/chat_benchmarks/MTBench
 
 # Log into HuggingFace for datasets and models.
 huggingface-cli login

From 1aa6d1680c2a8d6992793590af8a13fdfcd5b9ca Mon Sep 17 00:00:00 2001
From: penfever
Date: Sun, 5 Jan 2025 14:16:14 -0500
Subject: [PATCH 3/3] alpaca eval glm winrate fix

---
 .gitignore                                    |   9 ++
 .../src/alpaca_eval/metrics/glm_winrate.py    |   9 +-
 .../baseline_gpt4_1106_preview.csv            |   2 +
 json_to_csv.py                                | 114 ++++++++++++++++++
 pyproject.toml                                |   3 +-
 5 files changed, 132 insertions(+), 5 deletions(-)
 create mode 100644 eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv
 create mode 100644 json_to_csv.py

diff --git a/.gitignore b/.gitignore
index d865858..c208382 100644
--- a/.gitignore
+++ b/.gitignore
@@ -225,3 +225,12 @@ eval/chat_benchmarks/MixEval/mix_eval/data/test/
 eval/chat_benchmarks/MixEval/mix_eval/data/model_responses
 eval/chat_benchmarks/MixEval/mix_eval/eval_scripts
 eval/chat_benchmarks/MixEval/results
+
+# MT Bench Data
+eval/chat_benchmarks/MTBench/fastchat/llm_judge/data
+
+# Local logs, scripts and results
+results
+logs
+script
+offload
\ No newline at end of file
diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/glm_winrate.py b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/glm_winrate.py
index 201b9ce..1a1bd75 100644
--- a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/glm_winrate.py
+++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/glm_winrate.py
@@ -122,9 +122,12 @@ def get_length_controlled_winrate(
         save_weights_dir.mkdir(exist_ok=True, parents=True)
         weights_path = save_weights_dir / f"baseline_{baseline_name}.csv"
         if weights_path.exists():
-            saved_weights = pd.read_csv(weights_path, index_col=0)
-            new_weights = pd.DataFrame(weights, index=[model_name])
-            saved_weights = pd.concat([saved_weights, new_weights], axis=0)
+            try:
+                saved_weights = pd.read_csv(weights_path, index_col=0)
+                new_weights = pd.DataFrame(weights, index=[model_name])
+                saved_weights = pd.concat([saved_weights, new_weights], axis=0)
+            except Exception:
+                saved_weights = pd.DataFrame(weights, index=[model_name])
         else:
             saved_weights = pd.DataFrame(weights, index=[model_name])
         saved_weights = saved_weights[~saved_weights.index.duplicated(keep="last")]
diff --git a/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv
new file mode 100644
index 0000000..98b814f
--- /dev/null
+++ b/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/metrics/weights/weighted_alpaca_eval_gpt-4o-mini-2024-07-18/length_controlled_v1/baseline_gpt4_1106_preview.csv
@@ -0,0 +1,2 @@
+,np.tanh(std_delta_len),instruction_difficulty,not_gamed_baseline.astype(float)
+"model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-500k-8B-2blend,dtype=bfloat16",-1.5944321215706829,0.4904819163456168,-1.6050463989884014
diff --git a/json_to_csv.py b/json_to_csv.py
new file mode 100644
index 0000000..8083efb
--- /dev/null
+++ b/json_to_csv.py
@@ -0,0 +1,114 @@
+import json
+import csv
+import statistics
+import argparse
+import os
+from pathlib import Path
+
+def process_results(json_file):
+    # Read and parse JSON
+    with open(json_file, 'r') as f:
+        data = json.load(f)
+
+    results = data['results']
+
+    # Extract MTBench Average
+    mtbench_avg = results['MTBench']['Average']
+
+    # Extract Alpaca Eval Length Controlled Winrate
+    alpaca_winrate = results['alpaca_eval']['length_controlled_winrate']
+
+    # Extract MMLU Pro
+    mmlu_pro = results['leaderboard_mmlu_pro']['acc,none']
+
+    # Extract MixEval
+    mixeval = results['MixEval']['gpt-4o-mini-2024-07-18']['metrics']["overall"]
+
+    # Extract MBPP
+    mbpp = results['MBPP']['pass@1']
+
+    # Calculate average BBH score
+    bbh_scores = []
+    for key, value in results.items():
+        if key.startswith('leaderboard_bbh_') and isinstance(value, dict) and 'acc_norm,none' in value:
+            bbh_scores.append(value['acc_norm,none'])
+    bbh_avg = statistics.mean(bbh_scores)
+
+    # Calculate average GPQA score
+    gpqa_scores = []
+    for key in ['leaderboard_gpqa_diamond', 'leaderboard_gpqa_extended', 'leaderboard_gpqa_main']:
+        if key in results:
+            gpqa_scores.append(results[key]['acc_norm,none'])
+    gpqa_avg = statistics.mean(gpqa_scores)
+
+    # Calculate average MATH score
+    math_scores = []
+    for key, value in results.items():
+        if key.startswith('leaderboard_math_') and isinstance(value, dict) and 'exact_match,none' in value:
+            math_scores.append(value['exact_match,none'])
+    math_avg = statistics.mean(math_scores)
+
+    # Calculate average MUSR score
+    musr_scores = []
+    for key, value in results.items():
+        if key.startswith('leaderboard_musr_') and isinstance(value, dict) and 'acc_norm,none' in value:
+            musr_scores.append(value['acc_norm,none'])
+    musr_avg = statistics.mean(musr_scores)
+
+    # Extract IFEval average (using prompt-level strict accuracy)
+    ifeval_score = results['leaderboard_ifeval']['prompt_level_strict_acc,none']
+
+    # Create output dictionary
+    output = {
+        'MTBench': mtbench_avg,
+        'Alpaca Eval (LC)': alpaca_winrate,
+        'BBH': bbh_avg,
+        'GPQA': gpqa_avg,
+        'MATH': math_avg,
+        'MUSR': musr_avg,
+        'IFEval': ifeval_score,
+        'MMLU Pro': mmlu_pro,
+        'MixEval': mixeval,
+        'MBPP': mbpp,
+    }
+
+    return output
+
+def main():
+    # Set up argument parser
+    parser = argparse.ArgumentParser(description='Process evaluation results JSON to CSV')
+    parser.add_argument('--json_path', required=True, help='Path to the JSON file to process')
+    args = parser.parse_args()
+
+    # Convert path to Path object and resolve it
+    json_path = Path(args.json_path).resolve()
+
+    # Ensure the JSON file exists
+    if not json_path.exists():
+        print(f"Error: File not found: {json_path}")
+        return
+
+    try:
+        # Process the results
+        results = process_results(json_path)
+
+        # Create output path with same name but .csv extension
+        csv_path = json_path.with_suffix('.csv')
+
+        # Write to CSV
+        with open(csv_path, 'w', newline='') as f:
+            writer = csv.writer(f)
+            writer.writerow(['Metric', 'Value'])
+            for metric, value in results.items():
+                writer.writerow([metric, round(value, 4)])
+
+        print(f"\nResults have been saved to: {csv_path}")
+        print("\nSummary of results:")
+        for metric, value in results.items():
+            print(f"{metric}: {round(value, 4)}")
+
+    except Exception as e:
+        print(f"Error processing file: {e}")
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 824c115..6147033 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,7 @@ dependencies = [
     "pandas",
     "scipy",
     "scikit-learn",
-    "faiss-cpu==1.7.4",
+    "faiss-cpu",
 
     # ML Experiment Tracking & Visualization
     "wandb",
@@ -153,7 +153,6 @@ dev = [
 ]
 eval = [
     # Using exact package names from their setup files
-    "fschat @ file:eval/chat_benchmarks/MTBench", # Changed to match the likely package name
     "lm-eval[vllm]@git+https://github.com/EtashGuha/lm-evaluation-harness@etashg/tokenize_fix",
 ]
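
For reference, the json_to_csv.py helper introduced in PATCH 3/3 reads a results JSON containing the expected benchmark keys (MTBench, alpaca_eval, leaderboard_*, MixEval, MBPP) and writes a Metric,Value CSV next to the input file. A minimal usage sketch; the results path below is illustrative, not taken from the patches:

    # Summarize one run's metrics into results/my_model/results.csv and print a summary
    python json_to_csv.py --json_path results/my_model/results.json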