diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py
index e3c7d0aad..cb806e467 100644
--- a/mteb/leaderboard/app.py
+++ b/mteb/leaderboard/app.py
@@ -412,7 +412,7 @@ def filter_models(
     compatibility,
     instructions,
     model_size,
-    zero_shot,
+    zero_shot_setting,
 ):
     lower, upper = model_size
     # Setting to None, when the user doesn't specify anything
@@ -432,12 +432,12 @@ def filter_models(
     tasks = mteb.get_tasks(tasks=task_select)
     models_to_keep = set()
     for model_meta in model_metas:
-        is_zero_shot = model_meta.is_zero_shot_on(tasks)
-        if is_zero_shot is None:
-            if zero_shot == "hard":
+        is_model_zero_shot = model_meta.is_zero_shot_on(tasks)
+        if is_model_zero_shot is None:
+            if zero_shot_setting == "hard":
                 continue
-        if not zero_shot:
-            if zero_shot != "off":
+        elif not is_model_zero_shot:
+            if zero_shot_setting != "off":
                 continue
         models_to_keep.add(model_meta.name)
     return list(models_to_keep)
@@ -460,7 +460,7 @@ def update_models(
         compatibility,
         instructions,
         model_size,
-        zero_shot,
+        zero_shot_setting=zero_shot,
     )
     elapsed = time.time() - start_time
     logger.info(f"update_models callback: {elapsed}s")
diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py
index 9a1dc5799..041df4709 100644
--- a/mteb/leaderboard/table.py
+++ b/mteb/leaderboard/table.py
@@ -88,7 +88,7 @@ def get_means_per_types(per_task: pd.DataFrame):
                 dict(
                     model_name=model_name,
                     task_type=task_type,
-                    score=scores[tasks].mean(),
+                    score=scores[tasks].mean(skipna=False),
                 )
             )
     return pd.DataFrame.from_records(records)
diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py
index 015a96d33..e1632a3de 100644
--- a/mteb/load_results/benchmark_results.py
+++ b/mteb/load_results/benchmark_results.py
@@ -260,8 +260,16 @@ def parse_version(version_str: str) -> Version | None:
 
 def keep_best(group: pd.DataFrame) -> pd.DataFrame:
     is_main_revision = group["revision"] == group["main_revision"]
-    if is_main_revision.sum() == 1:
-        return group[is_main_revision]
+    # If the main revision is present we select that
+    if is_main_revision.sum() > 0:
+        return group[is_main_revision].head(n=1)
+    unique_revisions = group["revision"].unique()
+    # Filtering out no_revision_available if other revisions are present
+    if (len(unique_revisions) > 1) and (
+        "no_revision_available" in unique_revisions
+    ):
+        group = group[group["revision"] != "no_revision_available"]
+    # If there are any not-NA mteb versions, we select the latest one
     if group["mteb_version"].notna().any():
         group = group.dropna(subset=["mteb_version"])
         group = group.sort_values("mteb_version", ascending=False)