From 4fb46eb5fe550b462010c4672a49b5837f9165f0 Mon Sep 17 00:00:00 2001
From: JINO-ROHIT
Date: Tue, 12 Nov 2024 00:03:14 +0530
Subject: [PATCH 1/6] add MCC to BinaryClassificationEvaluator

---
 .../evaluation/BinaryClassificationEvaluator.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
index fdf76d6f0..757741efb 100644
--- a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
+++ b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
@@ -7,7 +7,7 @@
 from typing import TYPE_CHECKING, Literal
 
 import numpy as np
-from sklearn.metrics import average_precision_score
+from sklearn.metrics import average_precision_score, matthews_corrcoef
 from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
 
 from sentence_transformers.evaluation.SentenceEvaluator import SentenceEvaluator
@@ -124,6 +124,7 @@ def _append_csv_headers(self, similarity_fn_names: list[str]) -> None:
             "recall",
             "f1_threshold",
             "ap",
+            "mcc",
         ]
 
         for v in similarity_fn_names:
@@ -275,11 +276,15 @@ def compute_metrices(self, model: SentenceTransformer) -> dict[str, dict[str, fl
             f1, precision, recall, f1_threshold = self.find_best_f1_and_threshold(scores, labels, greater_is_better)
             ap = average_precision_score(labels, scores * (1 if greater_is_better else -1))
 
+            predicted_labels = (scores >= f1_threshold) if greater_is_better else (scores <= f1_threshold)
+            mcc = matthews_corrcoef(labels, predicted_labels)
+
             logger.info(f"Accuracy with {name}: {acc * 100:.2f}\t(Threshold: {acc_threshold:.4f})")
             logger.info(f"F1 with {name}: {f1 * 100:.2f}\t(Threshold: {f1_threshold:.4f})")
             logger.info(f"Precision with {name}: {precision * 100:.2f}")
             logger.info(f"Recall with {name}: {recall * 100:.2f}")
             logger.info(f"Average Precision with {name}: {ap * 100:.2f}\n")
+            logger.info(f"Matthews Correlation with {name}: {mcc:.4f}\n")
 
             output_scores[similarity_fn_name] = {
                 "accuracy": acc,
@@ -289,6 +294,7 @@ def compute_metrices(self, model: SentenceTransformer) -> dict[str, dict[str, fl
                 "precision": precision,
                 "recall": recall,
                 "ap": ap,
+                "mcc": mcc,
             }
 
         return output_scores

From 0a660a2db9e1a3f1a5fe834381650aba154dbb62 Mon Sep 17 00:00:00 2001
From: JINO-ROHIT
Date: Tue, 12 Nov 2024 09:42:46 +0530
Subject: [PATCH 2/6] updated docstring

---
 .../evaluation/BinaryClassificationEvaluator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
index 757741efb..da1b420dd 100644
--- a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
+++ b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
@@ -70,6 +70,7 @@ class BinaryClassificationEvaluator(SentenceEvaluator):
         Precision with Cosine-Similarity: 65.81
         Recall with Cosine-Similarity: 87.89
         Average Precision with Cosine-Similarity: 76.03
+        Matthews correlation coefficient: 62.48
         '''
         print(binary_acc_evaluator.primary_metric)
         # => "quora_duplicates_dev_cosine_ap"

From d46fefef435367b16920e5bcb60f03eeb2ecf91e Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 20 Nov 2024 13:25:04 +0100
Subject: [PATCH 3/6] Only use the newline for the last metric

---
 .../evaluation/BinaryClassificationEvaluator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git
a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
index da1b420dd..e08595444 100644
--- a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
+++ b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
@@ -284,7 +284,7 @@ def compute_metrices(self, model: SentenceTransformer) -> dict[str, dict[str, fl
             logger.info(f"F1 with {name}: {f1 * 100:.2f}\t(Threshold: {f1_threshold:.4f})")
             logger.info(f"Precision with {name}: {precision * 100:.2f}")
             logger.info(f"Recall with {name}: {recall * 100:.2f}")
-            logger.info(f"Average Precision with {name}: {ap * 100:.2f}\n")
+            logger.info(f"Average Precision with {name}: {ap * 100:.2f}")
             logger.info(f"Matthews Correlation with {name}: {mcc:.4f}\n")
 
             output_scores[similarity_fn_name] = {

From 89e578f2934703ba3f256c0e38b221255fa03b86 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 20 Nov 2024 13:29:19 +0100
Subject: [PATCH 4/6] Match formatting, i.e. *100 the MCC

---
 .../evaluation/BinaryClassificationEvaluator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
index e08595444..f592cb999 100644
--- a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
+++ b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
@@ -285,7 +285,7 @@ def compute_metrices(self, model: SentenceTransformer) -> dict[str, dict[str, fl
             logger.info(f"Precision with {name}: {precision * 100:.2f}")
             logger.info(f"Recall with {name}: {recall * 100:.2f}")
             logger.info(f"Average Precision with {name}: {ap * 100:.2f}")
-            logger.info(f"Matthews Correlation with {name}: {mcc:.4f}\n")
+            logger.info(f"Matthews Correlation with {name}: {mcc * 100:.2f}\n")
 
             output_scores[similarity_fn_name] = {
                 "accuracy": acc,

From 4d02b9c8d8829d5dc785e690ff1fade8e55d1414 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 20 Nov 2024 13:29:31 +0100
Subject: [PATCH 5/6] Update logging indentation

---
 .../evaluation/BinaryClassificationEvaluator.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
index f592cb999..bf0f9c3cd 100644
--- a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
+++ b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
@@ -280,11 +280,11 @@ def compute_metrices(self, model: SentenceTransformer) -> dict[str, dict[str, fl
             predicted_labels = (scores >= f1_threshold) if greater_is_better else (scores <= f1_threshold)
             mcc = matthews_corrcoef(labels, predicted_labels)
 
-            logger.info(f"Accuracy with {name}: {acc * 100:.2f}\t(Threshold: {acc_threshold:.4f})")
-            logger.info(f"F1 with {name}: {f1 * 100:.2f}\t(Threshold: {f1_threshold:.4f})")
-            logger.info(f"Precision with {name}: {precision * 100:.2f}")
-            logger.info(f"Recall with {name}: {recall * 100:.2f}")
-            logger.info(f"Average Precision with {name}: {ap * 100:.2f}")
+            logger.info(f"Accuracy with {name}:             {acc * 100:.2f}\t(Threshold: {acc_threshold:.4f})")
+            logger.info(f"F1 with {name}:                   {f1 * 100:.2f}\t(Threshold: {f1_threshold:.4f})")
+            logger.info(f"Precision with {name}:            {precision * 100:.2f}")
+            logger.info(f"Recall with {name}:               {recall * 100:.2f}")
+            logger.info(f"Average Precision with {name}:    {ap * 100:.2f}")
             logger.info(f"Matthews Correlation with {name}: {mcc * 100:.2f}\n")
 
             output_scores[similarity_fn_name] = {

From f04e02bad75f7393300ed5823126dc0c62700067 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 20 Nov 2024 13:33:48 +0100
Subject: [PATCH 6/6] Match the logging outputs in the example

---
 .../evaluation/BinaryClassificationEvaluator.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
index bf0f9c3cd..e0e00ebc9 100644
--- a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
+++ b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py
@@ -64,13 +64,13 @@ class BinaryClassificationEvaluator(SentenceEvaluator):
         )
         results = binary_acc_evaluator(model)
         '''
-        Binary Accuracy Evaluation of the model on the quora-duplicates-dev dataset:
-        Accuracy with Cosine-Similarity: 81.60 (Threshold: 0.8352)
-        F1 with Cosine-Similarity: 75.27 (Threshold: 0.7715)
-        Precision with Cosine-Similarity: 65.81
-        Recall with Cosine-Similarity: 87.89
-        Average Precision with Cosine-Similarity: 76.03
-        Matthews correlation coefficient: 62.48
+        Binary Accuracy Evaluation of the model on the quora_duplicates_dev dataset:
+        Accuracy with Cosine-Similarity:             81.60 (Threshold: 0.8352)
+        F1 with Cosine-Similarity:                   75.27 (Threshold: 0.7715)
+        Precision with Cosine-Similarity:            65.81
+        Recall with Cosine-Similarity:               87.89
+        Average Precision with Cosine-Similarity:    76.03
+        Matthews Correlation with Cosine-Similarity: 62.48
         '''
         print(binary_acc_evaluator.primary_metric)
         # => "quora_duplicates_dev_cosine_ap"
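
For reference, below is a minimal standalone sketch of the metric computation these patches add: predictions are obtained by thresholding the similarity scores at the best-F1 threshold (the comparison flips for distance metrics, where lower is better) and scored with scikit-learn's matthews_corrcoef. The scores, labels, and threshold are toy values for illustration only; in the evaluator itself the threshold comes from find_best_f1_and_threshold.

```python
import numpy as np
from sklearn.metrics import matthews_corrcoef

# Toy cosine-similarity scores and gold binary labels (illustrative only).
scores = np.array([0.91, 0.55, 0.78, 0.20, 0.30])
labels = np.array([1, 0, 1, 0, 1])

# For similarity functions, higher scores mean "positive pair"; for distance
# functions the comparison flips, mirroring the patched evaluator logic.
greater_is_better = True
f1_threshold = 0.5  # stand-in for the value returned by find_best_f1_and_threshold

# Threshold the scores into hard predictions, then score with MCC.
predicted_labels = (scores >= f1_threshold) if greater_is_better else (scores <= f1_threshold)
mcc = matthews_corrcoef(labels, predicted_labels)
print(f"Matthews Correlation: {mcc * 100:.2f}")  # MCC lies in [-1, 1]; x100 matches the logs
```

MCC balances all four confusion-matrix cells, so unlike accuracy or F1 it stays informative under class imbalance, which is presumably why it was added alongside average precision here.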