Commit 99c9f09
Merge branch 'main' into feature-customhandler
sangmandu authored Nov 15, 2024
2 parents 11c4e8d + 4b6b7d0

Showing 9 changed files with 26 additions and 34 deletions.
3 changes: 3 additions & 0 deletions berkeley-function-call-leaderboard/CHANGELOG.md
@@ -2,7 +2,10 @@

All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file.

- [Nov 15, 2024] [#762](https://github.com/ShishirPatil/gorilla/pull/762): Supply `data_multi_turn.csv` for multi-turn evaluation results
- [Nov 14, 2024] [#747](https://github.com/ShishirPatil/gorilla/pull/747): Minor grammatical corrections to the `DEFAULT_SYSTEM_PROMPT` that is supplied to all prompting models.
- [Nov 13, 2024] [#737](https://github.com/ShishirPatil/gorilla/pull/737), [#739](https://github.com/ShishirPatil/gorilla/pull/739), [#740](https://github.com/ShishirPatil/gorilla/pull/740): Bug fix in the dataset and possible answers for the live and multi-turn categories.
- [Nov 9, 2024] [#749](https://github.com/ShishirPatil/gorilla/pull/749): Remove `Llama-3.2-3B-Instruct-FC` and `Llama-3.2-1B-Instruct-FC` from the leaderboard. According to the [official Llama documentation](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-tool-calling-(1b/3b)-), these models perform function calling using the prompt-style chat template rather than the specialized function-calling format.
- [Nov 8, 2024] [#720](https://github.com/ShishirPatil/gorilla/pull/720): Add new model `BitAgent/GoGoAgent` to the leaderboard.
- [Oct 30, 2024] [#725](https://github.com/ShishirPatil/gorilla/pull/725), [#733](https://github.com/ShishirPatil/gorilla/pull/733): Update evaluation metric for multi-turn categories:
  - Introduce a new response-based checker, which works alongside the existing state-based checker.
2 changes: 0 additions & 2 deletions berkeley-function-call-leaderboard/README.md
@@ -182,7 +182,6 @@ Below is _a table of models we support_ to run our leaderboard evaluation against
|meta-llama/Meta-Llama-3-{8B,70B}-Instruct 💻| Prompt|
|meta-llama/Llama-3.1-{8B,70B}-Instruct-FC 💻| Function Calling|
|meta-llama/Llama-3.1-{8B,70B}-Instruct 💻| Prompt|
|meta-llama/Llama-3.2-{1B,3B}-Instruct-FC 💻| Function Calling|
|meta-llama/Llama-3.2-{1B,3B}-Instruct 💻| Prompt|
|open-mixtral-{8x7b,8x22b} | Prompt|
|open-mixtral-8x22b-FC | Function Calling|
@@ -265,7 +264,6 @@ In the following two sections, the optional `--test-category` parameter can be used
- `multi_turn_miss_func`: Multi-turn function calls with missing function.
- `multi_turn_miss_param`: Multi-turn function calls with missing parameter.
- `multi_turn_long_context`: Multi-turn function calls with long context.
- `multi_turn_composite`: Multi-turn function calls with missing function, missing parameter, and long context.
- If no test category is provided, the script will run all available test categories (equivalent to `all`).

> If you want to run the `all`, `non_live`, `executable` or `python` category, make sure to register your REST API keys in the `.env` file. This is because the Gorilla Openfunctions Leaderboard needs to test the model's generated output against real-world APIs!
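
For the executable categories mentioned above, a quick sanity check that the keys from your `.env` file are actually visible to the evaluation process might look like the sketch below. This is only an illustration: the key name is a placeholder rather than one of the leaderboard's actual required keys, and it assumes the `python-dotenv` package is available.

```python
import os

from dotenv import load_dotenv  # pip install python-dotenv

# Read the .env file from the current working directory into the process environment.
load_dotenv()

# Placeholder name; substitute the REST API keys the leaderboard actually requires.
REQUIRED_KEYS = ["EXAMPLE_REST_API_KEY"]

missing = [key for key in REQUIRED_KEYS if not os.getenv(key)]
if missing:
    raise RuntimeError(f"Missing REST API keys in .env: {', '.join(missing)}")
```
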
2 changes: 0 additions & 2 deletions berkeley-function-call-leaderboard/bfcl/constant.py
@@ -66,14 +66,12 @@
"multi_turn_miss_func",
"multi_turn_miss_param",
"multi_turn_long_context",
# "multi_turn_composite", # Composite is currently not included in the leaderboard
],
"multi_turn": [
"multi_turn_base",
"multi_turn_miss_func",
"multi_turn_miss_param",
"multi_turn_long_context",
# "multi_turn_composite", # Composite is currently not included in the leaderboard
],
"single_turn": [
"exec_simple",
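
The lists above appear to define what a collection name such as `multi_turn` expands to when used as a `--test-category` value. A minimal sketch of that expansion, with the dictionary and helper names assumed rather than taken from this diff:

```python
# Sketch only: variable and function names are illustrative, not the project's own.
TEST_COLLECTION_MAPPING = {
    "multi_turn": [
        "multi_turn_base",
        "multi_turn_miss_func",
        "multi_turn_miss_param",
        "multi_turn_long_context",
    ],
    "single_turn": [
        "exec_simple",
        # ... remaining single-turn categories
    ],
}

def resolve_test_categories(requested: list[str]) -> list[str]:
    """Expand collection names into concrete categories; pass concrete names through."""
    resolved: list[str] = []
    for name in requested:
        resolved.extend(TEST_COLLECTION_MAPPING.get(name, [name]))
    return resolved

print(resolve_test_categories(["multi_turn"]))
# ['multi_turn_base', 'multi_turn_miss_func', 'multi_turn_miss_param', 'multi_turn_long_context']
```
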
@@ -58,7 +58,6 @@
"Miss Func",
"Miss Param",
"Long Context",
"Composite",
]


@@ -91,7 +90,6 @@
"Multi Turn Miss Func",
"Multi Turn Miss Param",
"Multi Turn Long Context",
"Multi Turn Composite",
"Relevance Detection",
"Irrelevance Detection",
"Organization",
@@ -602,10 +602,10 @@ def runner(model_names, test_categories, api_sanity_check):
)

print(
f"🏁 Evaluation completed. See {SCORE_PATH / 'data_overall.csv'} for evaluation results on BFCL V3."
f"🏁 Evaluation completed. See {SCORE_PATH / 'data_overall.csv'} for overall evaluation results on BFCL V3."
)
print(
f"See {SCORE_PATH / 'data_live.csv'} and {SCORE_PATH / 'data_non_live.csv'} for evaluation results on BFCL V3 Live and Non-Live categories respectively."
f"See {SCORE_PATH / 'data_live.csv'}, {SCORE_PATH / 'data_non_live.csv'} and {SCORE_PATH / 'data_multi_turn.csv'} for detailed evaluation results on each sub-section category."
)


@@ -412,16 +412,12 @@ def generate_leaderboard_csv(
multi_turn_long_context = value.get(
"multi_turn_long_context", {"accuracy": 0, "total_count": 0}
)
multi_turn_composite = value.get(
"multi_turn_composite", {"accuracy": 0, "total_count": 0}
)
overall_accuracy_multi_turn = calculate_unweighted_accuracy(
[
multi_turn_base,
multi_turn_miss_func,
multi_turn_miss_param,
multi_turn_long_context,
# multi_turn_composite, # Composite is currently not included in the leaderboard, because it takes too long to evaluate
]
)
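
The helper `calculate_unweighted_accuracy` is not shown in this diff; assuming it macro-averages per-category results of the `{"accuracy": ..., "total_count": ...}` shape used here, a minimal sketch would be:

```python
# Sketch of an unweighted (macro) average over per-category results. The real helper
# lives elsewhere in the repository and may differ in detail.
def calculate_unweighted_accuracy(entries: list[dict]) -> dict:
    accuracy = sum(entry["accuracy"] for entry in entries) / len(entries)
    total_count = sum(entry["total_count"] for entry in entries)
    return {"accuracy": accuracy, "total_count": total_count}

# Each category counts equally, regardless of how many samples it contains.
print(calculate_unweighted_accuracy([
    {"accuracy": 0.40, "total_count": 200},
    {"accuracy": 0.20, "total_count": 800},
]))
# accuracy 0.3 (up to float rounding), total_count 1000
```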

@@ -434,7 +430,6 @@ def generate_leaderboard_csv(
multi_turn_miss_func["accuracy"],
multi_turn_miss_param["accuracy"],
multi_turn_long_context["accuracy"],
# multi_turn_composite["accuracy"],
]
)

@@ -481,8 +476,6 @@ def generate_leaderboard_csv(
multi_turn_miss_func["accuracy"],
multi_turn_miss_param["accuracy"],
multi_turn_long_context["accuracy"],
'N/A', # No composite score for now
# multi_turn_composite["accuracy"],
total_relevance["accuracy"],
total_irrelevance["accuracy"],
model_metadata[model_name_escaped][2],
@@ -524,6 +517,23 @@ def generate_leaderboard_csv(
else:
f.write(",".join(row))

# Write Multi Turn Score File
data_multi_turn.sort(key=lambda x: x[2], reverse=True)
for i in range(len(data_multi_turn)):
data_multi_turn[i][0] = str(i + 1)
for j in range(2, len(data_multi_turn[i])):
data_multi_turn[i][j] = "{:.2f}%".format(data_multi_turn[i][j] * 100)

data_multi_turn.insert(0, COLUMNS_MULTI_TURN)

filepath = output_path / "data_multi_turn.csv"
with open(filepath, "w") as f:
for i, row in enumerate(data_multi_turn):
if i < len(data_multi_turn) - 1:
f.write(",".join(row) + "\n")
else:
f.write(",".join(row))

# Write Total Score File
data_combined.sort(key=lambda x: x[1], reverse=True)
for i in range(len(data_combined)):
@@ -532,8 +542,7 @@
for j in range(4, 8):
data_combined[i][j] = str(data_combined[i][j])
for j in range(8, len(data_combined[i]) - 2):
# TODO: Remove this after composite is added
data_combined[i][j] = "{:.2f}%".format(data_combined[i][j] * 100) if data_combined[i][j] != 'N/A' else 'N/A'
data_combined[i][j] = "{:.2f}%".format(data_combined[i][j] * 100)
for j in range(len(data_combined[i]) - 2, len(data_combined[i])):
data_combined[i][j] = str(data_combined[i][j])

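The new block above writes `data_multi_turn.csv` as comma-joined rows: a header row first, the rank in the first column, and accuracy cells rendered as strings such as "12.34%". Under those assumptions (the output directory and the meaning of the remaining columns are guesses), reading the file back might look like:

```python
import csv

# Assumes the writer above: header row, rank in column 0, accuracies as "NN.NN%" strings.
# The path below is an assumption, not necessarily the leaderboard's actual output location.
with open("score/data_multi_turn.csv", newline="") as f:
    rows = list(csv.reader(f))

header, body = rows[0], rows[1:]
for row in body:
    rank = int(row[0])
    accuracies = [float(cell.rstrip("%")) / 100 for cell in row[2:] if cell.endswith("%")]
    print(rank, row[1], accuracies)
```
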
@@ -451,18 +451,6 @@
"Meta",
"Meta Llama 3 Community",
],
"meta-llama/Llama-3.2-1B-Instruct-FC": [
"Llama-3.2-1B-Instruct (FC)",
"https://llama.meta.com/llama3",
"Meta",
"Meta Llama 3 Community",
],
"meta-llama/Llama-3.2-3B-Instruct-FC": [
"Llama-3.2-3B-Instruct (FC)",
"https://llama.meta.com/llama3",
"Meta",
"Meta Llama 3 Community",
],
"command-r-plus-FC": [
"Command-R-Plus (FC) (Original)",
"https://txt.cohere.com/command-r-plus-microsoft-azure",
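
Each entry in the metadata mapping above pairs a model id with a four-element list: display name, homepage URL, organization, and license. A small illustration using the removed `Llama-3.2-1B-Instruct-FC` entry (the dictionary name here is assumed):

```python
# The dictionary name is assumed; the entry itself is copied from the diff above.
MODEL_METADATA_MAPPING = {
    "meta-llama/Llama-3.2-1B-Instruct-FC": [
        "Llama-3.2-1B-Instruct (FC)",
        "https://llama.meta.com/llama3",
        "Meta",
        "Meta Llama 3 Community",
    ],
}

display_name, url, org, license_name = MODEL_METADATA_MAPPING[
    "meta-llama/Llama-3.2-1B-Instruct-FC"
]
print(f"{display_name} by {org}, licensed under {license_name}: {url}")
```

The `model_metadata[model_name_escaped][2]` lookup in the leaderboard CSV code earlier in this diff indexes into lists of this shape.
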
@@ -2,12 +2,12 @@

DEFAULT_SYSTEM_PROMPT_WITHOUT_FUNC_DOC = """You are an expert in composing functions. You are given a question and a set of possible functions. Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out.
You should only return the function call in tools call sections.
You should only return the function calls in your response.
If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]
You SHOULD NOT include any other text in the response.
At each turn, you should try your best to complete the tasks requested by the user within the current turn. Continue outputting functions to call until you have fulfilled the user's request to the best of your ability. Once you have no more functions to call, the system will consider the current turn complete and proceed to the next turn or task.
At each turn, you should try your best to complete the tasks requested by the user within the current turn. Continue to output functions to call until you have fulfilled the user's request to the best of your ability. Once you have no more functions to call, the system will consider the current turn complete and proceed to the next turn or task.
"""

DEFAULT_SYSTEM_PROMPT = (
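
The prompt above asks models to answer with a bracketed list of calls such as `[func_name1(params_name1=params_value1, ...), func_name2(params)]`. Purely as an illustration (this is not the leaderboard's own decoder), a response in that shape can be decomposed with Python's `ast` module; the function and argument names below are invented:

```python
import ast

# A response in the format the prompt requests; the names are made up for the example.
example_response = '[get_weather(city="London", unit="celsius"), get_time(timezone="UTC")]'

calls = []
for node in ast.parse(example_response, mode="eval").body.elts:
    name = node.func.id  # function name
    kwargs = {kw.arg: ast.literal_eval(kw.value) for kw in node.keywords}
    calls.append((name, kwargs))

print(calls)
# [('get_weather', {'city': 'London', 'unit': 'celsius'}), ('get_time', {'timezone': 'UTC'})]
```
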
@@ -92,10 +92,8 @@
"meta-llama/Meta-Llama-3-8B-Instruct": LlamaHandler,
"meta-llama/Meta-Llama-3-70B-Instruct": LlamaHandler,
"meta-llama/Llama-3.1-8B-Instruct-FC": LlamaFCHandler,
"meta-llama/Llama-3.1-70B-Instruct-FC": LlamaFCHandler,
"meta-llama/Llama-3.2-1B-Instruct-FC": LlamaFCHandler,
"meta-llama/Llama-3.2-3B-Instruct-FC": LlamaFCHandler,
"meta-llama/Llama-3.1-8B-Instruct": LlamaHandler,
"meta-llama/Llama-3.1-70B-Instruct-FC": LlamaFCHandler,
"meta-llama/Llama-3.1-70B-Instruct": LlamaHandler,
"meta-llama/Llama-3.2-1B-Instruct": LlamaHandler,
"meta-llama/Llama-3.2-3B-Instruct": LlamaHandler,
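
The mapping above routes each model id to a handler class, which is how the prompt-style and function-calling (`-FC`) variants of the same checkpoint end up handled differently. A minimal dispatch sketch; the constructor signature and lookup helper are assumptions, not the project's code:

```python
# Stand-ins for the real handler classes referenced in the diff above.
class LlamaHandler:
    def __init__(self, model_name: str) -> None:
        self.model_name = model_name

class LlamaFCHandler(LlamaHandler):
    pass

HANDLER_MAP = {
    "meta-llama/Llama-3.1-8B-Instruct": LlamaHandler,
    "meta-llama/Llama-3.1-8B-Instruct-FC": LlamaFCHandler,
}

def get_handler(model_name: str) -> LlamaHandler:
    try:
        return HANDLER_MAP[model_name](model_name)
    except KeyError:
        raise ValueError(f"Unsupported model: {model_name}") from None

print(type(get_handler("meta-llama/Llama-3.1-8B-Instruct-FC")).__name__)  # LlamaFCHandler
```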
