From 3f5ee82a5049eaf235a84fcfc9278f48adecfcb7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1rton=20Kardos?=
Date: Wed, 15 Jan 2025 10:43:16 +0100
Subject: [PATCH] fix: Added `ModelMeta` for BGE, GTE Chinese and multilingual
 models (#1811)

* Added BGE Chinese and multilingual-gemma models

* Added GTE multilingual and Chinese models

* Fixed date format
---
 mteb/models/bge_models.py | 195 ++++++++++++++++++++++++++++++++++++++
 mteb/models/gte_models.py | 195 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 389 insertions(+), 1 deletion(-)

diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py
index 276d28526f..23851b4988 100644
--- a/mteb/models/bge_models.py
+++ b/mteb/models/bge_models.py
@@ -5,6 +5,89 @@
 from mteb.model_meta import ModelMeta, sentence_transformers_loader
 
 model_prompts = {"query": "Represent this sentence for searching relevant passages: "}
+model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"}
+
+bge_m_training_data = {
+    # source: https://arxiv.org/pdf/2402.03216
+    "MIRACLRetrieval": ["train"],
+    "MIRACLRetrievalHardNegatives": ["train"],
+    "MIRACLReranking": ["train"],
+    "LeCaRDv2": ["train"],
+    "CMedQAv1-reranking": ["train"],
+    "CMedQAv2-reranking": ["train"],
+    "MrTidyRetrieval": ["train"],
+    "T2Reranking": ["train"],
+    "MSMARCO": ["train"],
+    "MSMARCOHardNegatives": ["train"],
+    "NanoMSMARCORetrieval": ["train"],
+    "MSMARCO-PL": ["train"],  # translation not trained on
+    "NQ": ["train"],
+    "NQHardNegatives": ["train"],
+    "NanoNQRetrieval": ["train"],
+    "NQ-PL": ["train"],  # translation not trained on
+    "HotpotQA": ["train"],
+    "HotpotQA-PL": ["train"],  # translation not trained on
+    "HotpotQAHardNegatives": ["train"],
+    # + synthetic data
+}
+
+bge_training_data = {
+    # source: https://data.baai.ac.cn/details/BAAI-MTP
+    "NQ": ["test"],
+    "NQHardNegatives": ["test"],
+    "AmazonReviewsClassification": [
+        "validation",
+        "test",
+    ],  # assumed from: amazon_reviews_multi
+    "MLQARetrieval": [
+        "validation",
+        "test",
+    ],  # assumed from mlqa (question, context)
+    # not in mteb
+    # Dataset Pairs
+    # wudao (title, passage)
+    # cmrc2018 (query, context)
+    # dureader (query, context)
+    # simclue (sentence_a, sentence_b)
+    # csl (title, abstract)
+    # amazon_reviews_multi (title, body)
+    # wiki_atomic_edits (base_sentence, edited_sentence)
+    # mlqa (question, context)
+    # xlsum (title, summary) (title, text)
+    # "sentence-transformers data": [],  # https://huggingface.co/datasets/sentence-transformers/embedding-training-data  # TODO check this further
+    # "wikipedia": [],  # title + section title, passage
+    # "reddit": [],  # title, body
+    # "stackexchange": [],  # (title, upvoted answer) (title+body, upvoted answer)
+    # "s2orc": [],  # (title, abstract) (title, citation title) (abstract, citation abstract)
+}
+
+bge_chinese_training_data = {
+    # source: https://arxiv.org/pdf/2309.07597
+    "T2Retrieval": ["train"],
+    "DuReader": ["train"],
+    "MMarcoReranking": ["train"],
+    "CMedQAv2-reranking": ["train"],
+    "Cmnli": ["train"],
+    "Ocnli": ["train"],
+    # not in mteb
+    # - multi-cpr
+    # - NLI-zh
+    # Dataset Pairs
+    # wudao (title, passage)
+    # cmrc2018 (query, context)
+    # dureader (query, context)
+    # simclue (sentence_a, sentence_b)
+    # csl (title, abstract)
+    # amazon_reviews_multi (title, body)
+    # wiki_atomic_edits (base_sentence, edited_sentence)
+    # mlqa (question, context)
+    # xlsum (title, summary) (title, text)
+    # "sentence-transformers data": [],  # https://huggingface.co/datasets/sentence-transformers/embedding-training-data  # TODO check this further
"wikipedia": [], # title + section title, passage + # "reddit": [], # title, body + # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) + # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) +} bge_small_en_v1_5 = ModelMeta( loader=partial( # type: ignore @@ -167,3 +250,115 @@ # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) }, ) + +bge_small_zh_v1_5 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-small-zh-v1.5", + revision="7999e1d3359715c523056ef9478215996d62a620", + model_prompts=model_prompts_zh, + ), + name="BAAI/bge-small-zh-v1.5", + languages=["zho_Hans"], + open_weights=True, + revision="7999e1d3359715c523056ef9478215996d62a620", + release_date="2023-09-12", # initial commit of hf model. + n_parameters=24_000_000, + memory_usage=None, + embed_dim=512, + license="mit", + max_tokens=512, + reference="https://huggingface.co/BAAI/bge-small-zh-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP + public_training_code=None, # seemingly released (at least for some models, but the link is broken + training_datasets=bge_chinese_training_data, +) + +bge_base_zh_v1_5 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-base-zh-v1.5", + revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65", + model_prompts=model_prompts_zh, + ), + name="BAAI/bge-base-zh-v1.5", + languages=["zho_Hans"], + open_weights=True, + revision="f03589ceff5aac7111bd60cfc7d497ca17ecac65", + release_date="2023-09-11", # initial commit of hf model. + n_parameters=438_000_000, + memory_usage=None, + embed_dim=768, + license="mit", + max_tokens=512, + reference="https://huggingface.co/BAAI/bge-base-zh-v1.5", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=True, + public_training_data=True, # https://data.baai.ac.cn/details/BAAI-MTP + public_training_code=None, # seemingly released (at least for some models, but the link is broken + training_datasets=bge_chinese_training_data, +) + +bge_large_zh_v1_5 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="BAAI/bge-large-zh-v1.5", + revision="79e7739b6ab944e86d6171e44d24c997fc1e0116", + model_prompts=model_prompts_zh, + ), + name="BAAI/bge-large-zh-v1.5", + languages=["zho_Hans"], + open_weights=True, + revision="79e7739b6ab944e86d6171e44d24c997fc1e0116", + release_date="2023-09-12", # initial commit of hf model. 
+    n_parameters=326_000_000,
+    memory_usage=None,
+    embed_dim=1024,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/BAAI/bge-large-zh-v1.5",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    public_training_data=True,  # https://data.baai.ac.cn/details/BAAI-MTP
+    public_training_code=None,  # seemingly released (at least for some models), but the link is broken
+    training_datasets=bge_chinese_training_data,
+)
+
+bge_multilingual_gemma2 = ModelMeta(
+    loader=partial(  # type: ignore
+        sentence_transformers_loader,
+        model_name="BAAI/bge-multilingual-gemma2",
+        revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a",
+    ),
+    name="BAAI/bge-multilingual-gemma2",
+    languages=[
+        "eng_Latn",
+        "zho_Hans",
+        "kor_Hang",
+        "kor_Latn",
+        "fra_Latn",
+        "jpn_Jpan",
+        "jpn_Latn",
+    ],  # This list is incomplete. Their description says "and more".
+    # I'm also unsure about the scripts.
+    open_weights=True,
+    revision="992e13d8984fde2c31ef8a3cb2c038aeec513b8a",
+    release_date="2024-07-25",  # initial commit of hf model.
+    n_parameters=9.24 * 1e9,
+    memory_usage=None,
+    embed_dim=3584,  # from old C-MTEB leaderboard
+    license="gemma",
+    max_tokens=8192,  # from old C-MTEB leaderboard
+    reference="https://huggingface.co/BAAI/bge-multilingual-gemma2",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_data=False,
+    public_training_code=False,
+    training_datasets=None,  # not disclosed
+)
diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py
index 648fc18850..f800aaa941 100644
--- a/mteb/models/gte_models.py
+++ b/mteb/models/gte_models.py
@@ -5,7 +5,7 @@
 
 import torch
 
 from mteb.encoder_interface import PromptType
-from mteb.model_meta import ModelMeta
+from mteb.model_meta import ModelMeta, sentence_transformers_loader
 
 from mteb.models.instruct_wrapper import instruct_wrapper
@@ -105,3 +105,196 @@ def instruction_template(
     framework=["Sentence Transformers", "PyTorch"],
     use_instructions=True,
 )
+
+gte_small_zh = ModelMeta(
+    loader=partial(  # type: ignore
+        sentence_transformers_loader,
+        model_name="thenlper/gte-small-zh",
+        revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a",
+    ),
+    name="thenlper/gte-small-zh",
+    languages=["zho_Hans"],
+    open_weights=True,
+    revision="af7bd46fbb00b3a6963c8dd7f1786ddfbfbe973a",
+    release_date="2023-11-08",  # initial commit of hf model.
+    n_parameters=30.3 * 1e6,
+    memory_usage=None,
+    embed_dim=512,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/thenlper/gte-small-zh",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_data=False,
+    public_training_code=None,
+    training_datasets=None,  # Not disclosed
+)
+
+gte_base_zh = ModelMeta(
+    loader=partial(  # type: ignore
+        sentence_transformers_loader,
+        model_name="thenlper/gte-base-zh",
+        revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c",
+    ),
+    name="thenlper/gte-base-zh",
+    languages=["zho_Hans"],
+    open_weights=True,
+    revision="71ab7947d6fac5b64aa299e6e40e6c2b2e85976c",
+    release_date="2023-11-08",  # initial commit of hf model.
+    n_parameters=102 * 1e6,
+    memory_usage=None,
+    embed_dim=768,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/thenlper/gte-base-zh",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_data=False,
+    public_training_code=None,
+    training_datasets=None,  # Not disclosed
+)
+
+gte_large_zh = ModelMeta(
+    loader=partial(  # type: ignore
+        sentence_transformers_loader,
+        model_name="thenlper/gte-large-zh",
+        revision="64c364e579de308104a9b2c170ca009502f4f545",
+    ),
+    name="thenlper/gte-large-zh",
+    languages=["zho_Hans"],
+    open_weights=True,
+    revision="64c364e579de308104a9b2c170ca009502f4f545",
+    release_date="2023-11-08",  # initial commit of hf model.
+    n_parameters=326 * 1e6,
+    memory_usage=None,
+    embed_dim=1024,
+    license="mit",
+    max_tokens=512,
+    reference="https://huggingface.co/thenlper/gte-large-zh",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_data=False,
+    public_training_code=None,
+    training_datasets=None,  # Not disclosed
+)
+
+gte_multilingual_langs = [
+    "afr_Latn",
+    "ara_Arab",
+    "aze_Latn",
+    "bel_Cyrl",
+    "bul_Cyrl",
+    "ben_Beng",
+    "cat_Latn",
+    "ceb_Latn",
+    "ces_Latn",
+    "cym_Latn",
+    "dan_Latn",
+    "deu_Latn",
+    "ell_Grek",
+    "eng_Latn",
+    "spa_Latn",
+    "est_Latn",
+    "eus_Latn",
+    "fas_Arab",
+    "fin_Latn",
+    "fra_Latn",
+    "glg_Latn",
+    "guj_Gujr",
+    "heb_Hebr",
+    "hin_Deva",
+    "hrv_Latn",
+    "hat_Latn",
+    "hun_Latn",
+    "hye_Armn",
+    "ind_Latn",
+    "isl_Latn",
+    "ita_Latn",
+    "jpn_Jpan",
+    "jav_Latn",
+    "kat_Geor",
+    "kaz_Cyrl",
+    "khm_Khmr",
+    "kan_Knda",
+    "kor_Hang",
+    "kir_Cyrl",
+    "lao_Laoo",
+    "lit_Latn",
+    "lav_Latn",
+    "mkd_Cyrl",
+    "mal_Mlym",
+    "mon_Cyrl",
+    "mar_Deva",
+    "msa_Latn",
+    "mya_Mymr",
+    "nep_Deva",
+    "nld_Latn",
+    "nor_Latn",
+    "pan_Guru",
+    "pol_Latn",
+    "por_Latn",
+    "que_Latn",
+    "ron_Latn",
+    "rus_Cyrl",
+    "sin_Sinh",
+    "slk_Latn",
+    "slv_Latn",
+    "swa_Latn",
+    "tam_Taml",
+    "tel_Telu",
+    "tha_Thai",
+    "tgl_Latn",
+    "tur_Latn",
+    "ukr_Cyrl",
+    "urd_Arab",
+    "vie_Latn",
+    "yor_Latn",
+    "zho_Hans",
+]
+# Source: https://arxiv.org/pdf/2407.19669
+gte_multi_training_data = {
+    "T2Retrieval": ["train"],
+    "DuReader": ["train"],
+    "MMarcoReranking": ["train"],
+    "CMedQAv2-reranking": ["train"],
+    "NQ": ["train"],
+    "MSMARCO": ["train"],
+    "HotpotQA": ["train"],
+    "FEVER": ["train"],
+    "MIRACLReranking": ["train"],
+    "MrTidyRetrieval": ["train"],
+    "MultiLongDocRetrieval": ["train"],
+    # not in MTEB:
+    # - TriviaQA
+    # - SQuAD
+    # - AllNLI
+    # - Multi-CPR
+}
+
+gte_multilingual_base = ModelMeta(
+    loader=partial(  # type: ignore
+        sentence_transformers_loader,
+        model_name="Alibaba-NLP/gte-multilingual-base",
+        revision="ca1791e0bcc104f6db161f27de1340241b13c5a4",
+    ),
+    name="Alibaba-NLP/gte-multilingual-base",
+    languages=gte_multilingual_langs,
+    open_weights=True,
+    revision="ca1791e0bcc104f6db161f27de1340241b13c5a4",
+    release_date="2024-07-20",  # initial commit of hf model.
+    n_parameters=305 * 1e6,
+    memory_usage=None,
+    embed_dim=768,
+    license="apache-2.0",
+    max_tokens=8192,
+    reference="https://huggingface.co/Alibaba-NLP/gte-multilingual-base",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_data=True,
+    public_training_code=None,  # couldn't find
+    training_datasets=gte_multi_training_data,
+)
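
---
A minimal usage sketch for reviewers (outside the diff): how one of the newly
registered entries can be exercised through the public mteb API. The task
choice is illustrative and assumes an environment with mteb and
sentence-transformers installed.

    import mteb

    # The metadata alone can be inspected without downloading any weights.
    meta = mteb.get_model_meta("BAAI/bge-small-zh-v1.5")
    print(meta.n_parameters, meta.embed_dim, meta.max_tokens)

    # get_model resolves the ModelMeta registered above and invokes its loader,
    # i.e. sentence_transformers_loader with the Chinese query prompt.
    model = mteb.get_model("BAAI/bge-small-zh-v1.5")

    # T2Reranking appears in bge_chinese_training_data above, so scores on it
    # are in-domain; it is used here only as a quick smoke test.
    tasks = mteb.get_tasks(tasks=["T2Reranking"])
    results = mteb.MTEB(tasks=tasks).run(model, output_folder="results")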