Commit ccda8fe: update support for claude and minicpm-v, add an answer extractor for models like o1, a tiny fix of eval metrics
JamesZhutheThird committed Nov 18, 2024
1 parent db67800 commit ccda8fe
Showing 8 changed files with 310 additions and 46 deletions.
46 changes: 46 additions & 0 deletions eval/answer_extractor.py
@@ -0,0 +1,46 @@
"""
Extract answers from the output of the o1 model
"""

import json
import pdb

from tqdm import tqdm
from prompts import answer_extractor_prompt
from models.gpt import GPTEvaluator
from args import parse_args_for_answer_extractor

def extract_answer(model, pred):
    """
    Extract answers from the output of the o1 model
    """

    messages = [{
        "role": "system",
        "content": answer_extractor_prompt
    }, {
        "role": "system",
        "content": pred
    }]

    pred_extract = model.generate_response(messages, prepare_inputs=False)

    return pred_extract

def main(args):
    with open(args.prediction_file, 'r', encoding="utf-8") as f:
        pred_data = json.load(f)

    model = GPTEvaluator(api_key=args.api_key, model=args.model_version, api_url=args.api_url, max_tokens=500, temperature=0, top_p=1, presence_penalty=0.0, frequency_penalty=0.0)

    for pred in tqdm(pred_data.keys()):
        pred_extract = extract_answer(model, pred_data[pred]["prediction"])
        pred_data[pred]["prediction"] = [pred_extract, pred_data[pred]["prediction"]]

    with open(args.prediction_file.replace(".json", "_extracted.json"), 'w', encoding="utf-8") as f:
        json.dump(pred_data, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    args = parse_args_for_answer_extractor()
    main(args)
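For reference, a minimal sketch of the data shape this script assumes and produces; the question ID and prediction text below are hypothetical, and fields other than "prediction" are omitted.

# Hypothetical before/after illustration of answer_extractor.py.
# The input prediction file maps question IDs to dicts with a raw "prediction" string:
before = {
    "physics_0001_0": {"prediction": "Reasoning... so the final answer is B."}
}
# After main() runs, "prediction" holds [extracted_answer, raw_output] and the result
# is written to the input path with ".json" replaced by "_extracted.json":
after = {
    "physics_0001_0": {"prediction": ["B", "Reasoning... so the final answer is B."]}
}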
65 changes: 54 additions & 11 deletions eval/args.py
@@ -5,26 +5,34 @@
import argparse

model_list = {
"gpt-4o": {
"avail_model": ["gpt-4o-2024-08-06","gpt-4o-2024-05-13", "gpt-4o","gpt-4o-mini","gpt-4o-mini-2024-07-18","o1-preview-2024-09-12","o1-mini-2024-09-12"],
"model_type": "api",
"support_input": [0, 1, 2, 3],
"executor": "gpt",
"evaluator": "GPTEvaluator",
"split_sys": True,
},
"gpt-4v": {
"avail_model": ["gpt-4-vision-preview", ],
"avail_model": ["gpt-4-vision-preview"],
"model_type": "api",
"support_input": [2, 3],
"executor": "gpt",
"evaluator": "GPTEvaluator",
"split_sys": True,
},
"gpt": {
"avail_model": ["gpt-3.5-turbo", "gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613", "gpt-4", "gpt-4-0314", "gpt-4-0613","gpt-4-1106-preview", ],
"avail_model": ["gpt-3.5-turbo", "gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613", "gpt-4", "gpt-4-0314", "gpt-4-0613", "gpt-4-1106-preview"],
"model_type": "api",
"support_input": [0, 1],
"executor": "gpt",
"evaluator": "GPTEvaluator",
"split_sys": True,
},
"claude": {
"avail_model": ["claude-3-opus-20240229", "claude-3-sonnet-20240229"],
"avail_model": ["claude-3-opus-20240229", "claude-3-sonnet-20240229","claude-3-5-sonnet-20241022"],
"model_type": "api",
"support_input": [0, 1,2,3],
"support_input": [0, 1, 2, 3],
"executor": "claude",
"evaluator": "ClaudeEvaluator",
"split_sys": True,
@@ -59,6 +67,13 @@
"evaluator": "VisCPMEvaluator",
"split_sys": False,
},
"minicpmv": {
"model_type": "local",
"support_input": [0, 1, 2, 3],
"executor": "minicpmv",
"evaluator": "MiniCPMEvaluator",
"split_sys": False,
},
"qwen-vl": {
"model_type": "local",
"support_input": [0, 1, 2, 3],
@@ -97,6 +112,21 @@
}


api_price = {  # Price per 1k tokens, [input, output]; USD unless marked otherwise
    "gpt-4-vision-preview": [0.01, 0.03],
    "gpt-3.5-turbo-0125": [0.0005, 0.0015],
    "gpt-4o": [0.005, 0.015],
    "gpt-4o-2024-08-06": [0.005, 0.015],
    "gpt-4o-mini": [0.00015, 0.0006],
    "gpt-4o-mini-2024-07-18": [0.00015, 0.0006],
    "o1-mini-2024-09-12": [0.006, 0.018],
    "o1-preview-2024-09-12": [0.03, 0.09],
    "gemini-1.5-pro-latest": [0.00125, 0.005],
    "glm-4v-plus": [0.01, 0.01],  # CNY
    "glm-4v": [0.05, 0.05],  # CNY
    "claude-3-5-sonnet-20241022": [0.005, 0.015],
}

def parse_args_for_eval():
parser = argparse.ArgumentParser()

@@ -126,9 +156,9 @@ def parse_args_for_eval():
parser.add_argument('--cot', action='store_true', help='Whether to use chain-of-thought. The performance using chain-of-thought is not guaranteed.')
parser.add_argument('--few_shot', '-k', type=int, default=0, help='Specify the number of few shot samples. By leaving it empty, it means zero-shot k=0. The performance using few-shot is not guaranteed.')
parser.add_argument('--questions_type', type=str, default="0,1,2",
                    help='Specify the type of the questions to be tested. 0 - single-answer choice (SA), 1 - multiple-answer choice (MA), 2 - fill-in-the-blank (FB), 3 - open-ended discussion (OP). Leaving it at the default tests the objective questions [0,1,2].')
parser.add_argument('--image_type', type=str, default="0,1,2",
                    help='Specify the number of images involved in the questions to be tested. 0 - no-image (NI), 1 - single-image (SI), 2 - multiple-image (MI). Leaving it at the default tests all questions [0,1,2].')
parser.add_argument('--subset', type=str, default=None, help='The path to the list of the problems to be tested. Use "../data/hard_list_v1.2.1_20240206.json" to test on MULTI-Elite.')
parser.add_argument('--subject', type=str, default=None, help='Specify the subject of the problems to be tested.')
parser.add_argument('--input_type', type=int, choices=range(0, 4), default=0, help='Specify the input type. 0 - pure-text, 1 - text-with-captions/ocr, 2 - text-and-images, 3 - only-images. By leaving it empty, it means pure_text.')
@@ -150,6 +180,18 @@ def parse_args_for_eval():
return args


def parse_args_for_answer_extractor():
parser = argparse.ArgumentParser()

parser.add_argument('--prediction_file', type=str, default=None, help='Specify the prediction json file.')
parser.add_argument('--model_version', '-v', type=str, default="gpt-4o-mini", help='Specify the model version to use for answer extraction, e.g. a specific GPT version.')
parser.add_argument('--api_key', type=str, default=None, help='Specify the api key. You need to fill in this if you want to test those models that are not deployed locally.')
parser.add_argument('--api_url', type=str, default="https://api.openai.com/v1/chat/completions", help='Specify the api url. You need to fill in this if you want to test those models that are not deployed locally.')

args = parser.parse_args()
return args


def parse_args_for_score():
parser = argparse.ArgumentParser()

@@ -169,6 +211,7 @@ def parse_args_for_score():
args = parser.parse_args()
return args


def parse_args_for_score_deploy():
class Args:
pass
@@ -181,22 +224,22 @@ class Args:
args.prediction_file = None
args.score_file = None
args.reference_dir = None

# score setting
args.detail = False
args.only_past = False

# other functions
args.model_list = False

return args


def print_model_list():
print('='*20)
print('=' * 20)
for model_name in model_list:
print(f'[{model_name}]')
print(' ',model_list[model_name])
print(' ', model_list[model_name])
# versions = model_list[model_name].get("avail_model", [])
# if len(versions) > 0:
# print(f"Available versions: {versions}")
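As a rough illustration of how the new api_price table is intended to be used, here is a small cost-estimate sketch; the token counts are invented, and the [0.005, 0.015] fallback simply mirrors the default used elsewhere in this commit.

# Hypothetical cost estimate using the api_price table above (run from the eval/ directory).
from args import api_price

prompt_tokens, completion_tokens = 12345, 6789  # made-up token counts
price = api_price.get("gpt-4o-mini", [0.005, 0.015])  # USD per 1k tokens, [input, output]
cost = prompt_tokens / 1000 * price[0] + completion_tokens / 1000 * price[1]
print(f"Estimated cost: ${cost:.5f}")  # -> Estimated cost: $0.00593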
55 changes: 37 additions & 18 deletions eval/eval.py
@@ -6,7 +6,7 @@
import glob
import pdb
import sys
from args import print_model_list, parse_args_for_eval, model_list
from args import print_model_list, parse_args_for_eval, model_list, api_price
import time
import importlib
import importlib.util
@@ -85,28 +85,46 @@ def evaluate(args, evaluator, questions):

save_checkpoints(args, questions_with_answers, 0)

if args.model_version in api_price.keys():
    tokens_calculate = {
        "prompt_tokens": 0,
        "completion_tokens": 0
    }
    for question_id, question in questions_with_answers.items():
        tokens_calculate["prompt_tokens"] += question.get("prompt_tokens", 0)
        tokens_calculate["completion_tokens"] += question.get("completion_tokens", 0)

    price = api_price.get(args.model_version, [0.005, 0.015])
    print(f"Total prompt tokens: {tokens_calculate['prompt_tokens']}, Total completion tokens: {tokens_calculate['completion_tokens']}, Total cost: ${'{0:.5f}'.format(tokens_calculate['prompt_tokens'] / 1000 * price[0] + tokens_calculate['completion_tokens'] / 1000 * price[1])}")




def generate_data(args):
questions = prepare_questions(args)
prompted_questions = get_prompts(questions, args)

# calculate the number of tokens
if args.model_version:
try:
encoding = tiktoken.encoding_for_model(args.model_version)
except:
try:
# calculate the number of tokens
if args.model_version:
try:
encoding = tiktoken.encoding_for_model(args.model_version)
except:
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
else:
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
else:
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
input_token_num = 0
input_image_num = 0
for question_id, question in tqdm(prompted_questions.items()):
input_token_num += len(encoding.encode(question.get("prompted_system_content", ""))) + len(encoding.encode(question.get("prompted_user_content", ""))) + len(encoding.encode(question.get("prompted_content", ""))) + len(
encoding.encode(" ".join(question.get("prompted_content_list", []))))
input_image_num += question["question_image_number"]

print(f"Total number of tokens: {input_token_num}")
print(f"Total number of images: {input_image_num}")
input_token_num = 0
input_image_num = 0
for question_id, question in tqdm(prompted_questions.items()):
input_token_num += len(encoding.encode(question.get("prompted_system_content", ""))) + len(encoding.encode(question.get("prompted_user_content", ""))) + len(encoding.encode(question.get("prompted_content", ""))) + len(
encoding.encode(" ".join(question.get("prompted_content_list", []))))
input_image_num += question["question_image_number"]

print(f"Total number of tokens: {input_token_num}")
print(f"Total number of images: {input_image_num}")
except Exception as e:
print(f"Error {e} occurred during token calculation.")

return prompted_questions


@@ -120,7 +138,8 @@ def get_evaluator(args):
evaluator_module = importlib.import_module(module_pos)
Evaluator = getattr(evaluator_module, model_list[args.model]["evaluator"])
print(f"Using evaluator {model_list[args.model]['evaluator']} from {module_pos}")
except:
except Exception as e:
print(e)
print(f"Module \"{model_list[args.model]['evaluator']}\" for evaluation not found in {module_pos}. Please check your implementation.")
sys.exit(0)

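The token-counting change above wraps the estimate in a try/except and falls back to a known tiktoken encoding when the model name cannot be mapped. A standalone sketch of that fallback pattern, with example model names only:

import tiktoken

def get_encoding(model_version=None):
    # Fall back to a known encoding when the model name is missing or unknown to tiktoken.
    try:
        if model_version:
            return tiktoken.encoding_for_model(model_version)
    except KeyError:
        pass
    return tiktoken.encoding_for_model("gpt-3.5-turbo-0613")

enc = get_encoding("o1-preview-2024-09-12")  # may be unknown to older tiktoken builds
print(len(enc.encode("How many tokens is this prompt?")))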
22 changes: 13 additions & 9 deletions eval/metrics.py
@@ -19,11 +19,11 @@

def SingleAnswerChoiceEval(pred, label):
"""
Extract the first English letter that appears in the output as the answer
Extract the last English letter that appears in the output as the answer
"""
match = re.search(r'[a-zA-Z]', pred)
if match:
answer = match.group(0)
matches = re.findall(r'[a-zA-Z]', pred)
if matches:
answer = matches[0].upper()
score = 1 if answer == label else 0
else:
score = 0
@@ -36,10 +36,10 @@ def MultipleAnswersChoiceEval(pred, label):
Each correctly selected option earns +1 point; choosing any wrong option scores 0 outright
Scores are not normalized
"""
match = re.search(r'[a-zA-Z ,]+', pred)
matches = re.findall(r'[a-zA-Z ,]+[a-zA-Z]*[a-zA-Z ,]+', pred)
score = 0
if match:
answer = match.group(0)
if matches:
answer = matches[0].upper()
answer = answer.replace(' ', '').replace(',', '').replace('、', '')
answer = ''.join(sorted(set(answer), key=answer.index))
for choice in answer:
@@ -121,9 +121,13 @@ def evaluate_every_problem(args):
problem_id, sub_id = item['question_id'].rsplit('_', 1)
label = label_data[problem_id]["problem_answer_list"][int(sub_id)].strip()
type = label_data[problem_id]["problem_type_list"][int(sub_id)]
prediction = item['prediction']

if re.findall(r'Thought,持续 [0-9]+ 秒', prediction):
prediction = re.split(r'Thought,持续 [0-9]+ 秒', prediction)[-1].strip()

if type in EvaluateFuncDict:
score, total_score = EvaluateFuncDict[type](item['prediction'], label)
score, total_score = EvaluateFuncDict[type](prediction, label)
else:
score, total_score = 0, 0

@@ -140,6 +144,7 @@ def evaluate_every_problem(args):
pred_data[item['question_id']]["education"] = label_data[problem_id]["education"]
pred_data[item['question_id']]["subject"] = label_data[problem_id]["subject"][0]


with open(args.prediction_file.replace('prediction.json', 'score.json'), 'w', encoding="utf-8") as f:
json.dump(score_data, f, indent=4, ensure_ascii=False)

@@ -162,7 +167,6 @@ def calculate_score(args):

return (absolute_score, total_absolute_score, absolute_score / total_absolute_score * 100)

# TODO: add a relative method to calculate scores; like the absolute score, it should also apply to a single calculation


def init_dict(detail_data, education, subject):
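A hypothetical walk-through of the updated scoring path: strip an o1-style "Thought,持续 N 秒" marker as evaluate_every_problem now does, then score a multiple-answer prediction under the "+1 per correct option, 0 on any wrong option" rule from the docstring. The label and prediction are made up, and the scoring loop is a simplified mirror of MultipleAnswersChoiceEval, not the function itself.

import re

prediction = "Thought,持续 12 秒 A, C"
# Strip the o1 "Thought" marker before scoring, mirroring evaluate_every_problem.
if re.findall(r'Thought,持续 [0-9]+ 秒', prediction):
    prediction = re.split(r'Thought,持续 [0-9]+ 秒', prediction)[-1].strip()  # -> "A, C"

# Simplified multiple-answer scoring following the rule in the docstring above.
label = "ABC"
answer = prediction.upper().replace(' ', '').replace(',', '')  # -> "AC"
answer = ''.join(sorted(set(answer), key=answer.index))
score = 0
for choice in answer:
    if choice in label:
        score += 1
    else:
        score = 0
        break
print(score)  # 2: "A" and "C" are both correct options and no wrong option was chosen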
27 changes: 26 additions & 1 deletion eval/models/claude.py
@@ -1,7 +1,6 @@
"""Anthropic Claude Evaluator"""

import httpx
from anthropic import Anthropic
import requests
import json
from tqdm import tqdm
@@ -10,6 +9,7 @@
import pdb
from utils import encode_image_base64
import re
from args import api_price


class ClaudeEvaluator:
@@ -18,6 +18,7 @@ def __init__(self, api_key, model='claude-3-opus-20240229', api_url=None, max_to
self.api_key = api_key
self.api_url = api_url
if self.use_client:
from anthropic import Anthropic
self.client = Anthropic(api_key=self.api_key, base_url=self.api_url) # http_client=httpx.Client(proxies=api_url, transport=httpx.HTTPTransport(local_address="0.0.0.0"))
else:
self.header = {
@@ -35,6 +36,28 @@ def __init__(self, api_key, model='claude-3-opus-20240229', api_url=None, max_to
"frequency_penalty": frequency_penalty,
}
self.model = model
self.tokens = {
    "prompt_tokens": 0,
    "completion_tokens": 0
}
self.tokens_this_run = {
    "prompt_tokens": 0,
    "completion_tokens": 0
}
self.price = api_price.get(model, [0.005, 0.015])

def calculate_usage(self, response):
    prompt_tokens = response["usage"]["prompt_tokens"]
    completion_tokens = response["usage"].get("completion_tokens", 0)
    self.tokens["prompt_tokens"] += prompt_tokens
    self.tokens["completion_tokens"] += completion_tokens
    self.tokens_this_run["prompt_tokens"] += prompt_tokens
    self.tokens_this_run["completion_tokens"] += completion_tokens
    print(f"Prompt tokens: {prompt_tokens}, Completion tokens: {completion_tokens}, Cost: ${'{0:.5f}'.format(prompt_tokens / 1000 * self.price[0] + completion_tokens / 1000 * self.price[1])}")
    return prompt_tokens, completion_tokens

def calculate_usage_total(self):
    print(f"Total prompt tokens: {self.tokens['prompt_tokens']}, Total completion tokens: {self.tokens['completion_tokens']}, Total cost: ${'{0:.5f}'.format(self.tokens['prompt_tokens'] / 1000 * self.price[0] + self.tokens['completion_tokens'] / 1000 * self.price[1])}")

def prepare_inputs(self, question):
image_list = question.get("image_list")
@@ -98,6 +121,8 @@ def generate_response(self, question):
response_ = requests.post(self.api_url, json=self.post_dict, headers=self.header)
response_ = response_.json()
response = response_["choices"][0]["message"]["content"]
print(response_)
self.calculate_usage(response_)
except KeyboardInterrupt:
raise Exception("Terminated by user.")
except Exception as e:
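A minimal sketch of what the new usage accounting in ClaudeEvaluator tracks, using hand-written per-call usage dicts in the shape calculate_usage reads; real API responses may carry different or additional fields.

# Hypothetical running totals, mirroring how calculate_usage updates both counters.
tokens = {"prompt_tokens": 0, "completion_tokens": 0}           # cumulative counter
tokens_this_run = {"prompt_tokens": 0, "completion_tokens": 0}  # separate counter the evaluator keeps for the current run

for usage in [{"prompt_tokens": 1200, "completion_tokens": 80},
              {"prompt_tokens": 950, "completion_tokens": 60}]:  # made-up per-call usage
    for key in tokens:
        tokens[key] += usage.get(key, 0)
        tokens_this_run[key] += usage.get(key, 0)

price = [0.005, 0.015]  # USD per 1k tokens, [input, output], e.g. claude-3-5-sonnet-20241022
total_cost = tokens["prompt_tokens"] / 1000 * price[0] + tokens["completion_tokens"] / 1000 * price[1]
print(f"Total cost: ${total_cost:.5f}")  # -> Total cost: $0.01285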