Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Evaluating several Spanish embedding models #63

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
34 changes: 34 additions & 0 deletions evaluation/embeddings_model/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
```bash
python mteb_benchmark.py
```


| Benchmark | dariolopez/roberta-base-bne-finetuned-msmarco-qa-es-mnrl-mn | intfloat/multilingual-e5-large | BAAI/bge-m3 |
|-----------------------------|---------|----------|----------|
| AmazonReviewsClassification | 0.28194 | 0.42702 | **0.44678000000000007**
| CataloniaTweetClassification | 0.48369999999999996 | **0.5025000000000001** | 0.4875999999999999
| MassiveIntentClassification | 0.5473100201748486 | 0.6470073974445192 | **0.6703429724277068**
| MassiveScenarioClassification | 0.6322797579018158 | 0.689340954942838 | **0.721990585070612**
| MintakaRetrieval | 0.16548 | **0.2836** | 0.22341
| MIRACLRetrieval | 0.70137 | 0.82005 | -
| MLSUMClusteringS2S.v2 | 0.42841628413893035 | 0.48075917245775485 | -
| MTOPDomainClassification | 0.7924616410940628 | 0.8998999332888593 | **0.9183789192795198**
| MTOPIntentClassification | 0.5307538358905937 | **0.6673782521681121** | 0.6644096064042696
| MultiEURLEXMultilabelClassification | 0.05144 | 0.05226000000000001 | -
| MultiHateClassification | 0.5578 | **0.639** | 0.6253
| PawsX | 0.6015684593563027 | 0.5639685167829116 | **0.5735093608198505**
| PublicHealthQA | 0.62516 | **0.80811** | 0.79986
| SIB200Classification | 0.6549019607843137 | **0.7348039215686275** | 0.7313725490196079
| SIB200ClusteringS2S | 0.3347573603718645 | **0.3637865013678009** | 0.34998316595531576
| SpanishNewsClassification | 0.81318359375 | 0.880517578125 | -
| SpanishNewsClusteringP2P | 0.379918321557151 | 0.4399933663826367 | -
| SpanishSentimentClassification | 0.6378378378378378 | 0.9141891891891893 | **0.9395270270270271**
| STS17 | 0.23167578806693545 | **0.8092850520982419** | 0.7557298844031564
| STS22 | 0.49970798735740846 | 0.7865922376187726 | -
| STSBenchmarkMultilingualSTS | 0.7724973718736371 | **0.8646354604520479** | 0.8468700424822017
| STSES | 0.6040795444089487 | **0.7923804835012699** | 0.7743978294342545
| TweetSentimentClassification | 0.408203125 | 0.508984375 | **0.54765625**
| XMarket | 0.11391 | 0.14136 | -
| XNLI | 0.5793703625227221 | 0.7603625574106656 | **0.8076605260070395**
| XPQARetrieval | 0.47322 | 0.61619 | **0.62131**
| XQuADRetrieval | 0.81996 | **0.97644** | 0.96649
113 changes: 113 additions & 0 deletions evaluation/embeddings_model/mteb_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import mteb
from sentence_transformers import SentenceTransformer


# https://github.com/embeddings-benchmark/mteb


# TODO: write results on model cards huggingface
# Define the sentence-transformers model name
# model_name = "dariolopez/roberta-base-bne-finetuned-msmarco-qa-es-mnrl-mn"
# model_name = "dariolopez/roberta-base-bne-finetuned-msmarco-qa-es"
# model_name = "PlanTL-GOB-ES/roberta-base-bne"
# model_name = "PlanTL-GOB-ES/RoBERTalex"

# model_name = "hiiamsid/sentence_similarity_spanish_es"
# model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
# model_name = "intfloat/multilingual-e5-small"
# model_name = "intfloat/multilingual-e5-base"
# model_name = "intfloat/multilingual-e5-large"
# model_name = "intfloat/multilingual-e5-large-instruct"
model_name = "BAAI/bge-m3"

try:
model = SentenceTransformer(model_name, device='cuda')
print("Loaded model embedding using GPU")
except:
model = SentenceTransformer(model_name, device='cpu')
print("Loaded model embedding using CPU")


# MTEB task names to evaluate, grouped by task category.
# Commented-out entries were excluded from this benchmark run.
TASK_LIST_BITEXT_MINING = [
    "BibleNLPBitextMining",
    # "FloresBitextMining",  # s2s, crosslingual 406 / 41412 pairs
    # "NTREXBitextMining",  # s2s, crosslingual 62 / 1916 pairs
    "Tatoeba",
]

TASK_LIST_PAIR_CLASSIFICATION = [
    "PawsX",
    "XNLI",
]

TASK_LIST_MULTI_LABEL_CLASSIFICATION = [
    ## "MultiEURLEXMultilabelClassification"
]

TASK_LIST_RETRIEVAL = [
    # "BelebeleRetrieval",
    "MintakaRetrieval",
    ## "MIRACLRetrieval",
    # "MLQARetrieval",
    # "MultiLongDocRetrieval",
    "PublicHealthQA",
    # "XMarket",
    "XPQARetrieval",
    "XQuADRetrieval",
    ## "SpanishPassageRetrievalS2P",
    "SpanishPassageRetrievalS2S",
]

TASK_LIST_CLASSIFICATION = [
    "AmazonReviewsClassification",
    "CataloniaTweetClassification",
    # "LanguageClassification",
    "MassiveIntentClassification",
    "MassiveScenarioClassification",
    "MTOPDomainClassification",
    "MTOPIntentClassification",
    "MultiHateClassification",
    # "MultilingualSentimentClassification",
    "SIB200Classification",
    "TweetSentimentClassification",
    ## "SpanishNewsClassification",
    "SpanishSentimentClassification",
]

TASK_LIST_CLUSTERING = [
    # "MLSUMClusteringP2P.v2",
    ## "SpanishNewsClusteringP2P",
    ## "MLSUMClusteringS2S.v2",
    "SIB200ClusteringS2S",
]

TASK_LIST_RERANKING = [
    # "MIRACLReranking"
]

TASK_LIST_STS = [
    "STS17",
    ## "STS22",
    "STSBenchmarkMultilingualSTS",
    "STSES",
]

# Flat list of every enabled task, preserving the category order above.
TASK_LIST = [
    *TASK_LIST_BITEXT_MINING,
    *TASK_LIST_PAIR_CLASSIFICATION,
    *TASK_LIST_MULTI_LABEL_CLASSIFICATION,
    *TASK_LIST_RETRIEVAL,
    *TASK_LIST_CLASSIFICATION,
    *TASK_LIST_CLUSTERING,
    *TASK_LIST_RERANKING,
    *TASK_LIST_STS,
]


tasks = mteb.get_tasks(languages=["spa"]) # Spanish
print(tasks)
print(TASK_LIST)
tasks = mteb.get_tasks(tasks=TASK_LIST, languages=["spa"]) # Spanish filtered
evaluation = mteb.MTEB(tasks=tasks)
# evaluation = mteb.MTEB(tasks=tasks, task_langs=["es"])
results = evaluation.run(model, output_folder=f"results/{model_name}")
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
{
"dataset_revision": "1399c76144fd37290681b995c656ef9b2e06e26d",
"evaluation_time": 89.11965489387512,
"kg_co2_emissions": null,
"mteb_version": "1.12.39",
"scores": {
"test": [
{
"accuracy": 0.44678000000000007,
"f1": 0.42405774139116525,
"f1_weighted": 0.42405774139116525,
"hf_subset": "es",
"languages": [
"spa-Latn"
],
"main_score": 0.44678000000000007,
"scores_per_experiment": [
{
"accuracy": 0.4432,
"f1": 0.4128581086837859,
"f1_weighted": 0.4128581086837859
},
{
"accuracy": 0.4536,
"f1": 0.4362083315439791,
"f1_weighted": 0.43620833154397903
},
{
"accuracy": 0.468,
"f1": 0.4297143374330804,
"f1_weighted": 0.42971433743308046
},
{
"accuracy": 0.4484,
"f1": 0.44175168895312933,
"f1_weighted": 0.4417516889531293
},
{
"accuracy": 0.437,
"f1": 0.40908631376427473,
"f1_weighted": 0.4090863137642748
},
{
"accuracy": 0.4506,
"f1": 0.42870402357493365,
"f1_weighted": 0.42870402357493365
},
{
"accuracy": 0.4334,
"f1": 0.41082859750405804,
"f1_weighted": 0.41082859750405804
},
{
"accuracy": 0.4514,
"f1": 0.4396235588515759,
"f1_weighted": 0.43962355885157595
},
{
"accuracy": 0.4492,
"f1": 0.41488193691105657,
"f1_weighted": 0.4148819369110565
},
{
"accuracy": 0.433,
"f1": 0.41692051669177876,
"f1_weighted": 0.4169205166917787
}
]
}
],
"validation": [
{
"accuracy": 0.44446,
"f1": 0.42214102696443695,
"f1_weighted": 0.42214102696443695,
"hf_subset": "es",
"languages": [
"spa-Latn"
],
"main_score": 0.44446,
"scores_per_experiment": [
{
"accuracy": 0.4336,
"f1": 0.40320353806833475,
"f1_weighted": 0.4032035380683348
},
{
"accuracy": 0.4488,
"f1": 0.4305457057707863,
"f1_weighted": 0.4305457057707864
},
{
"accuracy": 0.464,
"f1": 0.42430424103687503,
"f1_weighted": 0.42430424103687503
},
{
"accuracy": 0.4442,
"f1": 0.4381740531500947,
"f1_weighted": 0.4381740531500947
},
{
"accuracy": 0.4406,
"f1": 0.4156932047502909,
"f1_weighted": 0.415693204750291
},
{
"accuracy": 0.4572,
"f1": 0.4340821005328042,
"f1_weighted": 0.43408210053280416
},
{
"accuracy": 0.437,
"f1": 0.4166405917478004,
"f1_weighted": 0.4166405917478004
},
{
"accuracy": 0.4414,
"f1": 0.43121042189248604,
"f1_weighted": 0.431210421892486
},
{
"accuracy": 0.4458,
"f1": 0.41170102764187205,
"f1_weighted": 0.41170102764187205
},
{
"accuracy": 0.432,
"f1": 0.41585538505302555,
"f1_weighted": 0.4158553850530255
}
]
}
]
},
"task_name": "AmazonReviewsClassification"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"dataset_revision": "264a18480c529d9e922483839b4b9758e690b762",
"evaluation_time": 9.615000247955322,
"kg_co2_emissions": null,
"mteb_version": "1.12.39",
"scores": {
"train": [
{
"accuracy": 0.98828125,
"f1": 0.984375,
"hf_subset": "eng_Latn-spa_Latn",
"languages": [
"eng-Latn",
"spa-Latn"
],
"main_score": 0.984375,
"precision": 0.982421875,
"recall": 0.98828125
},
{
"accuracy": 0.9921875,
"f1": 0.9895833333333333,
"hf_subset": "spa_Latn-eng_Latn",
"languages": [
"spa-Latn",
"eng-Latn"
],
"main_score": 0.9895833333333333,
"precision": 0.98828125,
"recall": 0.9921875
}
]
},
"task_name": "BibleNLPBitextMining"
}
Loading
Loading