Commit ccda8fe: update support for claude and minicpm-v, add an answer extractor for models like o1, a tiny fix of eval metrics
JamesZhutheThird committed Nov 18, 2024
1 parent db67800 commit ccda8fe
Showing 8 changed files with 310 additions and 46 deletions.
46 changes: 46 additions & 0 deletions eval/answer_extractor.py
@@ -0,0 +1,46 @@
"""
Extract answers from the output of the o1 model
"""

import json
import pdb

from tqdm import tqdm
from prompts import answer_extractor_prompt
from models.gpt import GPTEvaluator
from args import parse_args_for_answer_extractor

def extract_answer(model, pred):
    """
    Extract answers from the output of the o1 model
    """

    messages = [{
        "role": "system",
        "content": answer_extractor_prompt
    }, {
        "role": "system",
        "content": pred
    }]

    pred_extract = model.generate_response(messages, prepare_inputs=False)

    return pred_extract

def main(args):
    with open(args.prediction_file, 'r', encoding="utf-8") as f:
        pred_data = json.load(f)

    model = GPTEvaluator(api_key=args.api_key, model=args.model_version, api_url=args.api_url, max_tokens=500, temperature=0, top_p=1, presence_penalty=0.0, frequency_penalty=0.0)

    for pred in tqdm(pred_data.keys()):
        pred_extract = extract_answer(model, pred_data[pred]["prediction"])
        pred_data[pred]["prediction"] = [pred_extract, pred_data[pred]["prediction"]]

    with open(args.prediction_file.replace(".json", "_extracted.json"), 'w', encoding="utf-8") as f:
        json.dump(pred_data, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    args = parse_args_for_answer_extractor()
    main(args)
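For reference, a minimal sketch of the data shape this script assumes and produces; the question ID and prediction text below are hypothetical, and fields other than "prediction" are omitted.

# Hypothetical before/after illustration of answer_extractor.py.
# The input prediction file maps question IDs to dicts with a raw "prediction" string:
before = {
    "physics_0001_0": {"prediction": "Reasoning... so the final answer is B."}
}
# After main() runs, "prediction" holds [extracted_answer, raw_output] and the result
# is written to the input path with ".json" replaced by "_extracted.json":
after = {
    "physics_0001_0": {"prediction": ["B", "Reasoning... so the final answer is B."]}
}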
65 changes: 54 additions & 11 deletions eval/args.py
@@ -5,26 +5,34 @@
import argparse

model_list = {
"gpt-4o": {
"avail_model": ["gpt-4o-2024-08-06","gpt-4o-2024-05-13", "gpt-4o","gpt-4o-mini","gpt-4o-mini-2024-07-18","o1-preview-2024-09-12","o1-mini-2024-09-12"],
"model_type": "api",
"support_input": [0, 1, 2, 3],
"executor": "gpt",
"evaluator": "GPTEvaluator",
"split_sys": True,
},
"gpt-4v": {
"avail_model": ["gpt-4-vision-preview", ],
"avail_model": ["gpt-4-vision-preview"],
"model_type": "api",
"support_input": [2, 3],
"executor": "gpt",
"evaluator": "GPTEvaluator",
"split_sys": True,
},
"gpt": {
"avail_model": ["gpt-3.5-turbo", "gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613", "gpt-4", "gpt-4-0314", "gpt-4-0613","gpt-4-1106-preview", ],
"avail_model": ["gpt-3.5-turbo", "gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613", "gpt-4", "gpt-4-0314", "gpt-4-0613", "gpt-4-1106-preview"],
"model_type": "api",
"support_input": [0, 1],
"executor": "gpt",
"evaluator": "GPTEvaluator",
"split_sys": True,
},
"claude": {
"avail_model": ["claude-3-opus-20240229", "claude-3-sonnet-20240229"],
"avail_model": ["claude-3-opus-20240229", "claude-3-sonnet-20240229","claude-3-5-sonnet-20241022"],
"model_type": "api",
"support_input": [0, 1,2,3],
"support_input": [0, 1, 2, 3],
"executor": "claude",
"evaluator": "ClaudeEvaluator",
"split_sys": True,
@@ -59,6 +67,13 @@
"evaluator": "VisCPMEvaluator",
"split_sys": False,
},
"minicpmv": {
"model_type": "local",
"support_input": [0, 1, 2, 3],
"executor": "minicpmv",
"evaluator": "MiniCPMEvaluator",
"split_sys": False,
},
"qwen-vl": {
"model_type": "local",
"support_input": [0, 1, 2, 3],
@@ -97,6 +112,21 @@
}


api_price = {  # Price per 1k tokens, [input, output]; USD unless marked otherwise
    "gpt-4-vision-preview": [0.01, 0.03],
    "gpt-3.5-turbo-0125": [0.0005, 0.0015],
    "gpt-4o": [0.005, 0.015],
    "gpt-4o-2024-08-06": [0.005, 0.015],
    "gpt-4o-mini": [0.00015, 0.0006],
    "gpt-4o-mini-2024-07-18": [0.00015, 0.0006],
    "o1-mini-2024-09-12": [0.006, 0.018],
    "o1-preview-2024-09-12": [0.03, 0.09],
    "gemini-1.5-pro-latest": [0.00125, 0.005],
    "glm-4v-plus": [0.01, 0.01],  # CNY
    "glm-4v": [0.05, 0.05],  # CNY
    "claude-3-5-sonnet-20241022": [0.005, 0.015],
}

def parse_args_for_eval():
parser = argparse.ArgumentParser()

@@ -126,9 +156,9 @@ def parse_args_for_eval():
parser.add_argument('--cot', action='store_true', help='Whether to use chain-of-thought. The performance using chain-of-thought is not guaranteed.')
parser.add_argument('--few_shot', '-k', type=int, default=0, help='Specify the number of few shot samples. By leaving it empty, it means zero-shot k=0. The performance using few-shot is not guaranteed.')
parser.add_argument('--questions_type', type=str, default="0,1,2",
                    help='Specify the type of the questions to be tested. 0 - single-answer choice (SA), 1 - multiple-answer choice (MA), 2 - fill-in-the-blank (FB), 3 - open-ended discussion (OP). Leaving it at the default tests the objective questions [0,1,2].')
parser.add_argument('--image_type', type=str, default="0,1,2",
                    help='Specify the number of images involved in the questions to be tested. 0 - no-image (NI), 1 - single-image (SI), 2 - multiple-image (MI). Leaving it at the default tests all questions [0,1,2].')
parser.add_argument('--subset', type=str, default=None, help='The path to the list of the problems to be tested. Use "../data/hard_list_v1.2.1_20240206.json" to test on MULTI-Elite.')
parser.add_argument('--subject', type=str, default=None, help='Specify the subject of the problems to be tested.')
parser.add_argument('--input_type', type=int, choices=range(0, 4), default=0, help='Specify the input type. 0 - pure-text, 1 - text-with-captions/ocr, 2 - text-and-images, 3 - only-images. By leaving it empty, it means pure_text.')
@@ -150,6 +180,18 @@ def parse_args_for_eval():
return args


def parse_args_for_answer_extractor():
parser = argparse.ArgumentParser()

parser.add_argument('--prediction_file', type=str, default=None, help='Specify the prediction json file.')
parser.add_argument('--model_version', '-v', type=str, default="gpt-4o-mini", help='Specify the model version to use for answer extraction, e.g. a specific GPT version.')
parser.add_argument('--api_key', type=str, default=None, help='Specify the api key. You need to fill in this if you want to test those models that are not deployed locally.')
parser.add_argument('--api_url', type=str, default="https://api.openai.com/v1/chat/completions", help='Specify the api url. You need to fill in this if you want to test those models that are not deployed locally.')

args = parser.parse_args()
return args


def parse_args_for_score():
parser = argparse.ArgumentParser()

@@ -169,6 +211,7 @@ def parse_args_for_score():
args = parser.parse_args()
return args


def parse_args_for_score_deploy():
class Args:
pass
@@ -181,22 +224,22 @@ class Args:
args.prediction_file = None
args.score_file = None
args.reference_dir = None

# score setting
args.detail = False
args.only_past = False

# other functions
args.model_list = False

return args


def print_model_list():
print('='*20)
print('=' * 20)
for model_name in model_list:
print(f'[{model_name}]')
print(' ',model_list[model_name])
print(' ', model_list[model_name])
# versions = model_list[model_name].get("avail_model", [])
# if len(versions) > 0:
# print(f"Available versions: {versions}")
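As a rough illustration of how the new api_price table is intended to be used, here is a small cost-estimate sketch; the token counts are invented, and the [0.005, 0.015] fallback simply mirrors the default used elsewhere in this commit.

# Hypothetical cost estimate using the api_price table above (run from the eval/ directory).
from args import api_price

prompt_tokens, completion_tokens = 12345, 6789  # made-up token counts
price = api_price.get("gpt-4o-mini", [0.005, 0.015])  # USD per 1k tokens, [input, output]
cost = prompt_tokens / 1000 * price[0] + completion_tokens / 1000 * price[1]
print(f"Estimated cost: ${cost:.5f}")  # -> Estimated cost: $0.00593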
55 changes: 37 additions & 18 deletions eval/eval.py
@@ -6,7 +6,7 @@
import glob
import pdb
import sys
from args import print_model_list, parse_args_for_eval, model_list
from args import print_model_list, parse_args_for_eval, model_list, api_price
import time
import importlib
import importlib.util
@@ -85,28 +85,46 @@ def evaluate(args, evaluator, questions):

save_checkpoints(args, questions_with_answers, 0)

if args.model_version in api_price.keys():
    tokens_calculate = {
        "prompt_tokens": 0,
        "completion_tokens": 0
    }
    for question_id, question in questions_with_answers.items():
        tokens_calculate["prompt_tokens"] += question.get("prompt_tokens", 0)
        tokens_calculate["completion_tokens"] += question.get("completion_tokens", 0)

    price = api_price.get(args.model_version, [0.005, 0.015])
    print(f"Total prompt tokens: {tokens_calculate['prompt_tokens']}, Total completion tokens: {tokens_calculate['completion_tokens']}, Total cost: ${'{0:.5f}'.format(tokens_calculate['prompt_tokens'] / 1000 * price[0] + tokens_calculate['completion_tokens'] / 1000 * price[1])}")




def generate_data(args):
questions = prepare_questions(args)
prompted_questions = get_prompts(questions, args)

# calculate the number of tokens
if args.model_version:
try:
encoding = tiktoken.encoding_for_model(args.model_version)
except:
try:
# calculate the number of tokens
if args.model_version:
try:
encoding = tiktoken.encoding_for_model(args.model_version)
except:
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
else:
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
else:
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
input_token_num = 0
input_image_num = 0
for question_id, question in tqdm(prompted_questions.items()):
input_token_num += len(encoding.encode(question.get("prompted_system_content", ""))) + len(encoding.encode(question.get("prompted_user_content", ""))) + len(encoding.encode(question.get("prompted_content", ""))) + len(
encoding.encode(" ".join(question.get("prompted_content_list", []))))
input_image_num += question["question_image_number"]

print(f"Total number of tokens: {input_token_num}")
print(f"Total number of images: {input_image_num}")
input_token_num = 0
input_image_num = 0
for question_id, question in tqdm(prompted_questions.items()):
input_token_num += len(encoding.encode(question.get("prompted_system_content", ""))) + len(encoding.encode(question.get("prompted_user_content", ""))) + len(encoding.encode(question.get("prompted_content", ""))) + len(
encoding.encode(" ".join(question.get("prompted_content_list", []))))
input_image_num += question["question_image_number"]

print(f"Total number of tokens: {input_token_num}")
print(f"Total number of images: {input_image_num}")
except Exception as e:
print(f"Error {e} occurred during token calculation.")

return prompted_questions


@@ -120,7 +138,8 @@ def get_evaluator(args):
evaluator_module = importlib.import_module(module_pos)
Evaluator = getattr(evaluator_module, model_list[args.model]["evaluator"])
print(f"Using evaluator {model_list[args.model]['evaluator']} from {module_pos}")
except:
except Exception as e:
print(e)
print(f"Module \"{model_list[args.model]['evaluator']}\" for evaluation not found in {module_pos}. Please check your implementation.")
sys.exit(0)

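The token-counting change above wraps the estimate in a try/except and falls back to a known tiktoken encoding when the model name cannot be mapped. A standalone sketch of that fallback pattern, with example model names only:

import tiktoken

def get_encoding(model_version=None):
    # Fall back to a known encoding when the model name is missing or unknown to tiktoken.
    try:
        if model_version:
            return tiktoken.encoding_for_model(model_version)
    except KeyError:
        pass
    return tiktoken.encoding_for_model("gpt-3.5-turbo-0613")

enc = get_encoding("o1-preview-2024-09-12")  # may be unknown to older tiktoken builds
print(len(enc.encode("How many tokens is this prompt?")))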
22 changes: 13 additions & 9 deletions eval/metrics.py
@@ -19,11 +19,11 @@

def SingleAnswerChoiceEval(pred, label):
"""
Extract the first English letter that appears in the output as the answer
Extract the last English letter that appears in the output as the answer
"""
match = re.search(r'[a-zA-Z]', pred)
if match:
answer = match.group(0)
matches = re.findall(r'[a-zA-Z]', pred)
if matches:
answer = matches[0].upper()
score = 1 if answer == label else 0
else:
score = 0
@@ -36,10 +36,10 @@ def MultipleAnswersChoiceEval(pred, label):
Each correctly selected option earns +1 point; choosing any wrong option scores 0 outright
Scores are not normalized
"""
match = re.search(r'[a-zA-Z ,]+', pred)
matches = re.findall(r'[a-zA-Z ,]+[a-zA-Z]*[a-zA-Z ,]+', pred)
score = 0
if match:
answer = match.group(0)
if matches:
answer = matches[0].upper()
answer = answer.replace(' ', '').replace(',', '').replace('、', '')
answer = ''.join(sorted(set(answer), key=answer.index))
for choice in answer:
@@ -121,9 +121,13 @@ def evaluate_every_problem(args):
problem_id, sub_id = item['question_id'].rsplit('_', 1)
label = label_data[problem_id]["problem_answer_list"][int(sub_id)].strip()
type = label_data[problem_id]["problem_type_list"][int(sub_id)]
prediction = item['prediction']

if re.findall(r'Thought,持续 [0-9]+ 秒', prediction):
prediction = re.split(r'Thought,持续 [0-9]+ 秒', prediction)[-1].strip()

if type in EvaluateFuncDict:
score, total_score = EvaluateFuncDict[type](item['prediction'], label)
score, total_score = EvaluateFuncDict[type](prediction, label)
else:
score, total_score = 0, 0

@@ -140,6 +144,7 @@ def evaluate_every_problem(args):
pred_data[item['question_id']]["education"] = label_data[problem_id]["education"]
pred_data[item['question_id']]["subject"] = label_data[problem_id]["subject"][0]


with open(args.prediction_file.replace('prediction.json', 'score.json'), 'w', encoding="utf-8") as f:
json.dump(score_data, f, indent=4, ensure_ascii=False)

@@ -162,7 +167,6 @@ def calculate_score(args):

return (absolute_score, total_absolute_score, absolute_score / total_absolute_score * 100)

# TODO: add a relative method to calculate scores; like the absolute score, it should also apply to a single calculation


def init_dict(detail_data, education, subject):
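A hypothetical walk-through of the updated scoring path: strip an o1-style "Thought,持续 N 秒" marker as evaluate_every_problem now does, then score a multiple-answer prediction under the "+1 per correct option, 0 on any wrong option" rule from the docstring. The label and prediction are made up, and the scoring loop is a simplified mirror of MultipleAnswersChoiceEval, not the function itself.

import re

prediction = "Thought,持续 12 秒 A, C"
# Strip the o1 "Thought" marker before scoring, mirroring evaluate_every_problem.
if re.findall(r'Thought,持续 [0-9]+ 秒', prediction):
    prediction = re.split(r'Thought,持续 [0-9]+ 秒', prediction)[-1].strip()  # -> "A, C"

# Simplified multiple-answer scoring following the rule in the docstring above.
label = "ABC"
answer = prediction.upper().replace(' ', '').replace(',', '')  # -> "AC"
answer = ''.join(sorted(set(answer), key=answer.index))
score = 0
for choice in answer:
    if choice in label:
        score += 1
    else:
        score = 0
        break
print(score)  # 2: "A" and "C" are both correct options and no wrong option was chosen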
27 changes: 26 additions & 1 deletion eval/models/claude.py
@@ -1,7 +1,6 @@
"""Anthropic Claude Evaluator"""

import httpx
from anthropic import Anthropic
import requests
import json
from tqdm import tqdm
@@ -10,6 +9,7 @@
import pdb
from utils import encode_image_base64
import re
from args import api_price


class ClaudeEvaluator:
@@ -18,6 +18,7 @@ def __init__(self, api_key, model='claude-3-opus-20240229', api_url=None, max_to
self.api_key = api_key
self.api_url = api_url
if self.use_client:
from anthropic import Anthropic
self.client = Anthropic(api_key=self.api_key, base_url=self.api_url) # http_client=httpx.Client(proxies=api_url, transport=httpx.HTTPTransport(local_address="0.0.0.0"))
else:
self.header = {
@@ -35,6 +36,28 @@ def __init__(self, api_key, model='claude-3-opus-20240229', api_url=None, max_to
"frequency_penalty": frequency_penalty,
}
self.model = model
self.tokens = {
    "prompt_tokens": 0,
    "completion_tokens": 0
}
self.tokens_this_run = {
    "prompt_tokens": 0,
    "completion_tokens": 0
}
self.price = api_price.get(model, [0.005, 0.015])

def calculate_usage(self, response):
    prompt_tokens = response["usage"]["prompt_tokens"]
    completion_tokens = response["usage"].get("completion_tokens", 0)
    self.tokens["prompt_tokens"] += prompt_tokens
    self.tokens["completion_tokens"] += completion_tokens
    self.tokens_this_run["prompt_tokens"] += prompt_tokens
    self.tokens_this_run["completion_tokens"] += completion_tokens
    print(f"Prompt tokens: {prompt_tokens}, Completion tokens: {completion_tokens}, Cost: ${'{0:.5f}'.format(prompt_tokens / 1000 * self.price[0] + completion_tokens / 1000 * self.price[1])}")
    return prompt_tokens, completion_tokens

def calculate_usage_total(self):
    print(f"Total prompt tokens: {self.tokens['prompt_tokens']}, Total completion tokens: {self.tokens['completion_tokens']}, Total cost: ${'{0:.5f}'.format(self.tokens['prompt_tokens'] / 1000 * self.price[0] + self.tokens['completion_tokens'] / 1000 * self.price[1])}")

def prepare_inputs(self, question):
image_list = question.get("image_list")
@@ -98,6 +121,8 @@ def generate_response(self, question):
response_ = requests.post(self.api_url, json=self.post_dict, headers=self.header)
response_ = response_.json()
response = response_["choices"][0]["message"]["content"]
print(response_)
self.calculate_usage(response_)
except KeyboardInterrupt:
raise Exception("Terminated by user.")
except Exception as e:
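A minimal sketch of what the new usage accounting in ClaudeEvaluator tracks, using hand-written per-call usage dicts in the shape calculate_usage reads; real API responses may carry different or additional fields.

# Hypothetical running totals, mirroring how calculate_usage updates both counters.
tokens = {"prompt_tokens": 0, "completion_tokens": 0}           # cumulative counter
tokens_this_run = {"prompt_tokens": 0, "completion_tokens": 0}  # separate counter the evaluator keeps for the current run

for usage in [{"prompt_tokens": 1200, "completion_tokens": 80},
              {"prompt_tokens": 950, "completion_tokens": 60}]:  # made-up per-call usage
    for key in tokens:
        tokens[key] += usage.get(key, 0)
        tokens_this_run[key] += usage.get(key, 0)

price = [0.005, 0.015]  # USD per 1k tokens, [input, output], e.g. claude-3-5-sonnet-20241022
total_cost = tokens["prompt_tokens"] / 1000 * price[0] + tokens["completion_tokens"] / 1000 * price[1]
print(f"Total cost: ${total_cost:.5f}")  # -> Total cost: $0.01285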