Dependency conflicts #44

Open · wants to merge 5 commits into main
9 changes: 9 additions & 0 deletions .gitignore
@@ -225,3 +225,12 @@ eval/chat_benchmarks/MixEval/mix_eval/data/test/
eval/chat_benchmarks/MixEval/mix_eval/data/model_responses
eval/chat_benchmarks/MixEval/mix_eval/eval_scripts
eval/chat_benchmarks/MixEval/results

# MT Bench Data
eval/chat_benchmarks/MTBench/fastchat/llm_judge/data

# Local logs, scripts and results
results
logs
script
offload
1 change: 1 addition & 0 deletions README.md
@@ -32,6 +32,7 @@ conda activate evalchemy
# Install dependencies
pip install -e ".[eval]"
pip install -e eval/chat_benchmarks/alpaca_eval
pip install -e eval/chat_benchmarks/MTBench

# Log into HuggingFace for datasets and models.
huggingface-cli login
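One quick way to confirm that the editable installs above resolved without clobbering each other is to query the installed distributions. A minimal sketch; the distribution names below are assumptions based on the packages this PR touches, not names stated in the README:

```python
# Minimal sketch: report installed versions of the distributions this PR touches.
# The names below are assumptions; adjust them to the actual packages in use.
from importlib.metadata import PackageNotFoundError, version

for dist in ("fschat", "alpaca-eval", "faiss-cpu", "lm_eval"):
    try:
        print(f"{dist}: {version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")
```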
@@ -122,9 +122,12 @@ def get_length_controlled_winrate(
        save_weights_dir.mkdir(exist_ok=True, parents=True)
        weights_path = save_weights_dir / f"baseline_{baseline_name}.csv"
        if weights_path.exists():
-           saved_weights = pd.read_csv(weights_path, index_col=0)
-           new_weights = pd.DataFrame(weights, index=[model_name])
-           saved_weights = pd.concat([saved_weights, new_weights], axis=0)
+           try:
+               saved_weights = pd.read_csv(weights_path, index_col=0)
+               new_weights = pd.DataFrame(weights, index=[model_name])
+               saved_weights = pd.concat([saved_weights, new_weights], axis=0)
+           except Exception:
+               saved_weights = pd.DataFrame(weights, index=[model_name])
        else:
            saved_weights = pd.DataFrame(weights, index=[model_name])
        saved_weights = saved_weights[~saved_weights.index.duplicated(keep="last")]
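The new `try`/`except` covers the case where the cached weights CSV exists but cannot be parsed, for example an empty or truncated file left behind by an interrupted run. A minimal sketch of that failure mode, using an illustrative filename and weight values rather than the ones produced by alpaca_eval:

```python
# Minimal sketch of the failure mode the new fallback handles: a weights CSV
# that exists but is empty, so pd.read_csv raises instead of returning a frame.
import pandas as pd

weights_path = "baseline_example.csv"        # illustrative filename
weights = {"instruction_difficulty": 0.49}   # illustrative weight values
model_name = "example-model"                 # illustrative model name

open(weights_path, "w").close()              # simulate a truncated/empty cache file

try:
    saved_weights = pd.read_csv(weights_path, index_col=0)
    new_weights = pd.DataFrame(weights, index=[model_name])
    saved_weights = pd.concat([saved_weights, new_weights], axis=0)
except pd.errors.EmptyDataError:
    saved_weights = pd.DataFrame(weights, index=[model_name])

print(saved_weights)
```

With the patched code, a parse failure falls back to rebuilding the frame from the in-memory weights, so a stale or corrupted cache file no longer aborts the evaluation.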
@@ -0,0 +1,2 @@
,np.tanh(std_delta_len),instruction_difficulty,not_gamed_baseline.astype(float)
"model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-500k-8B-2blend,dtype=bfloat16",-1.5944321215706829,0.4904819163456168,-1.6050463989884014
114 changes: 114 additions & 0 deletions json_to_csv.py
@@ -0,0 +1,114 @@
import json
import csv
import statistics
import argparse
import os
from pathlib import Path

def process_results(json_file):
    # Read and parse JSON
    with open(json_file, 'r') as f:
        data = json.load(f)

    results = data['results']

    # Extract MTBench Average
    mtbench_avg = results['MTBench']['Average']

    # Extract Alpaca Eval Length Controlled Winrate
    alpaca_winrate = results['alpaca_eval']['length_controlled_winrate']

    # Extract MMLU Pro
    mmlu_pro = results['leaderboard_mmlu_pro']['acc,none']

    # Extract MixEval
    mixeval = results['MixEval']['gpt-4o-mini-2024-07-18']['metrics']["overall"]

    # Extract MBPP
    mbpp = results['MBPP']['pass@1']

    # Calculate average BBH score
    bbh_scores = []
    for key, value in results.items():
        if key.startswith('leaderboard_bbh_') and isinstance(value, dict) and 'acc_norm,none' in value:
            bbh_scores.append(value['acc_norm,none'])
    bbh_avg = statistics.mean(bbh_scores)

    # Calculate average GPQA score
    gpqa_scores = []
    for key in ['leaderboard_gpqa_diamond', 'leaderboard_gpqa_extended', 'leaderboard_gpqa_main']:
        if key in results:
            gpqa_scores.append(results[key]['acc_norm,none'])
    gpqa_avg = statistics.mean(gpqa_scores)

    # Calculate average MATH score
    math_scores = []
    for key, value in results.items():
        if key.startswith('leaderboard_math_') and isinstance(value, dict) and 'exact_match,none' in value:
            math_scores.append(value['exact_match,none'])
    math_avg = statistics.mean(math_scores)

    # Calculate average MUSR score
    musr_scores = []
    for key, value in results.items():
        if key.startswith('leaderboard_musr_') and isinstance(value, dict) and 'acc_norm,none' in value:
            musr_scores.append(value['acc_norm,none'])
    musr_avg = statistics.mean(musr_scores)

    # Extract IFEval average (using prompt-level strict accuracy)
    ifeval_score = results['leaderboard_ifeval']['prompt_level_strict_acc,none']

    # Create output dictionary
    output = {
        'MTBench': mtbench_avg,
        'Alpaca Eval (LC)': alpaca_winrate,
        'BBH': bbh_avg,
        'GPQA': gpqa_avg,
        'MATH': math_avg,
        'MUSR': musr_avg,
        'IFEval': ifeval_score,
        'MMLU Pro': mmlu_pro,
        'MixEval': mixeval,
        'MBPP': mbpp,
    }

    return output

def main():
    # Set up argument parser
    parser = argparse.ArgumentParser(description='Process evaluation results JSON to CSV')
    parser.add_argument('--json_path', required=True, help='Path to the JSON file to process')
    args = parser.parse_args()

    # Convert path to Path object and resolve it
    json_path = Path(args.json_path).resolve()

    # Ensure the JSON file exists
    if not json_path.exists():
        print(f"Error: File not found: {json_path}")
        return

    try:
        # Process the results
        results = process_results(json_path)

        # Create output path with same name but .csv extension
        csv_path = json_path.with_suffix('.csv')

        # Write to CSV
        with open(csv_path, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['Metric', 'Value'])
            for metric, value in results.items():
                writer.writerow([metric, round(value, 4)])

        print(f"\nResults have been saved to: {csv_path}")
        print("\nSummary of results:")
        for metric, value in results.items():
            print(f"{metric}: {round(value, 4)}")

    except Exception as e:
        print(f"Error processing file: {e}")

if __name__ == '__main__':
    main()
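Besides the `--json_path` CLI entry point, `process_results` can be imported and called directly, which may be handy when aggregating several runs. A minimal sketch, assuming the script is importable from the working directory; the results path below is illustrative:

```python
# Minimal sketch: call process_results directly instead of going through argparse.
# The results path is an illustrative assumption, not a location fixed by the PR.
from json_to_csv import process_results

summary = process_results("logs/evaluation_results.json")
for metric, value in summary.items():
    print(f"{metric}: {round(value, 4)}")
```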
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -39,7 +39,7 @@ dependencies = [
    "pandas",
    "scipy",
    "scikit-learn",
-   "faiss-cpu==1.7.4",
+   "faiss-cpu",

    # ML Experiment Tracking & Visualization
    "wandb",
@@ -153,7 +153,6 @@ dev = [
]
eval = [
    # Using exact package names from their setup files
-   "fschat @ file:eval/chat_benchmarks/MTBench", # Changed to match the likely package name
    "lm-eval[vllm]@git+https://github.com/EtashGuha/lm-evaluation-harness@etashg/tokenize_fix",
]
