From f070c8c96c543064a28ef58a5a5b29d6ed206798 Mon Sep 17 00:00:00 2001 From: Amit Moryossef Date: Sat, 10 Feb 2024 17:05:39 +0100 Subject: [PATCH 1/3] feat(evaluate): add signwriting evaluation metrics --- sockeye/checkpoint_decoder.py | 37 ++++++++++-------- sockeye/constants.py | 14 ++++--- sockeye/evaluate.py | 72 ++++++++++++++++++++++++++++++++++- 3 files changed, 101 insertions(+), 22 deletions(-) diff --git a/sockeye/checkpoint_decoder.py b/sockeye/checkpoint_decoder.py index 922682eb0..b3a4e819c 100644 --- a/sockeye/checkpoint_decoder.py +++ b/sockeye/checkpoint_decoder.py @@ -181,25 +181,32 @@ def decode_and_evaluate(self, output_name: Optional[str] = None) -> Dict[str, fl self.model.train(original_mode) # 2. Evaluate - - metrics = {C.BLEU: evaluate.raw_corpus_bleu(hypotheses=translations[0], - references=self.targets_sentences[0], + hypotheses = translations[0] + references = self.targets_sentences[0] + metrics = {C.BLEU: evaluate.raw_corpus_bleu(hypotheses=hypotheses, references=references, offset=evaluate.DEFAULT_OFFSET), - C.CHRF: evaluate.raw_corpus_chrf(hypotheses=translations[0], - references=self.targets_sentences[0]), - C.ROUGE1: evaluate.raw_corpus_rouge1(hypotheses=translations[0], - references=self.targets_sentences[0]), - C.ROUGE2: evaluate.raw_corpus_rouge2(hypotheses=translations[0], - references=self.targets_sentences[0]), - C.ROUGEL: evaluate.raw_corpus_rougel(hypotheses=translations[0], - references=self.targets_sentences[0]), - C.LENRATIO: evaluate.raw_corpus_length_ratio(hypotheses=translations[0], - references=self.targets_sentences[0]), - C.TER: evaluate.raw_corpus_ter(hypotheses=translations[0], - references=self.targets_sentences[0]), + C.CHRF: evaluate.raw_corpus_chrf(hypotheses=hypotheses, references=references), + C.ROUGE1: evaluate.raw_corpus_rouge1(hypotheses=hypotheses, references=references), + C.ROUGE2: evaluate.raw_corpus_rouge2(hypotheses=hypotheses, references=references), + C.ROUGEL: evaluate.raw_corpus_rougel(hypotheses=hypotheses, references=references), + C.LENRATIO: evaluate.raw_corpus_length_ratio(hypotheses=hypotheses, references=references), + C.TER: evaluate.raw_corpus_ter(hypotheses=hypotheses, references=references), C.AVG_TIME: avg_time, C.DECODING_TIME: trans_wall_time} + # Add SignWriting Evaluation Metrics if the module is available + try: + import signwriting_evaluation + metrics.update({ + C.SIGNWRITING_CLIP: evaluate.raw_corpus_signwriting_clip( + hypotheses_factors=translations, + references_factors=self.targets_sentences), + C.SIGNWRITING_SIMILARITY: evaluate.raw_corpus_signwriting_similarity( + hypotheses_factors=translations, + references_factors=self.targets_sentences)}) + except ModuleNotFoundError: + pass + if len(translations) > 1: # metrics for other target factors for i, _ in enumerate(translations[1:], 1): # only BLEU diff --git a/sockeye/constants.py b/sockeye/constants.py index 0729f2f92..0da5c21f7 100644 --- a/sockeye/constants.py +++ b/sockeye/constants.py @@ -261,15 +261,19 @@ ROUGEL = 'rougel' BOW_PERPLEXITY = 'bow-perplexity' TER = 'ter' +SIGNWRITING_CLIP = 'signwriting-clip' +SIGNWRITING_SIMILARITY = 'signwriting-similarity' LENRATIO = 'length-ratio-mse' AVG_TIME = "avg-sec-per-sent" DECODING_TIME = "decode-walltime" -METRICS = [PERPLEXITY, ACCURACY, LENRATIO_MSE, BLEU, CHRF, ROUGE1, BOW_PERPLEXITY, TER] +METRICS = [PERPLEXITY, ACCURACY, LENRATIO_MSE, BLEU, CHRF, ROUGE1, BOW_PERPLEXITY, TER, + SIGNWRITING_CLIP, SIGNWRITING_SIMILARITY] METRIC_MAXIMIZE = {ACCURACY: True, BLEU: True, CHRF: True, ROUGE1: True, PERPLEXITY: False, LENRATIO_MSE: False, - TER: False, BOW_PERPLEXITY: False} -METRIC_WORST = {ACCURACY: 0.0, BLEU: 0.0, CHRF: 0.0, ROUGE1: 0.0, PERPLEXITY: np.inf, BOW_PERPLEXITY: np.inf, TER: np.inf} -METRICS_REQUIRING_DECODER = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER] -EVALUATE_METRICS = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER] + TER: False, BOW_PERPLEXITY: False, SIGNWRITING_CLIP: True, SIGNWRITING_SIMILARITY: True} +METRIC_WORST = {ACCURACY: 0.0, BLEU: 0.0, CHRF: 0.0, ROUGE1: 0.0, PERPLEXITY: np.inf, BOW_PERPLEXITY: np.inf, + TER: np.inf, SIGNWRITING_CLIP: -1.0, SIGNWRITING_SIMILARITY: 0.0} +METRICS_REQUIRING_DECODER = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER, SIGNWRITING_CLIP, SIGNWRITING_SIMILARITY] +EVALUATE_METRICS = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER, SIGNWRITING_CLIP, SIGNWRITING_SIMILARITY] # loss CROSS_ENTROPY = 'cross-entropy' diff --git a/sockeye/evaluate.py b/sockeye/evaluate.py index 5e5c18ce2..47a6db8a6 100644 --- a/sockeye/evaluate.py +++ b/sockeye/evaluate.py @@ -18,7 +18,7 @@ import logging import sys from collections import defaultdict -from functools import partial +from functools import partial, lru_cache from typing import Callable, Iterable, Dict, List, Tuple, Optional import numpy as np @@ -118,6 +118,70 @@ def raw_corpus_length_ratio(hypotheses: Iterable[str], references: Iterable[str] return sum(ratios)/len(ratios) if len(ratios) else 0.0 +def serialize_factors(factors: List[Iterable[str]]) -> List[str]: + factors_list = zip(*factors) + for factors in factors_list: + factors_tokens = [f.strip().split(" ") for f in factors] + inverse_factors = zip(*factors_tokens) + yield " ".join([" ".join(f) for f in inverse_factors]) + + +def detokenize_signwriting(strings: List[str]) -> List[str]: + from signwriting.tokenizer import SignWritingTokenizer + tokenizer = SignWritingTokenizer() + signwriting_texts = [tokenizer.tokens_to_text(s.split(" ")) for s in strings] + # Regex Replace ([RBLM])00 with the capture group + import re + return [re.sub(r"([RBLM])00", r"\1", s) for s in signwriting_texts] + + +def raw_corpus_signwriting_similarity(hypotheses_factors: List[Iterable[str]], + references_factors: List[Iterable[str]]) -> float: + """ + Simple wrapper around the signwriting-evaluation similarity score. + + :param hypotheses_factors: Hypothesis factors streams. + :param references_factors: Reference factors streams. + :return: Similarity score as float between 0 and 1. + """ + try: + from signwriting_evaluation.metrics.similarity import SignWritingSimilarityMetric + except ImportError: + raise ImportError("Please install signwriting-evaluation to use the SignWriting Similarity metric.") + + hypotheses = detokenize_signwriting(serialize_factors(hypotheses_factors)) + references = detokenize_signwriting(serialize_factors(references_factors)) + + metric = SignWritingSimilarityMetric() + return metric.corpus_score(hypotheses, [references]) + + +@lru_cache(maxsize=1) +def load_signwriting_clip(): + try: + from signwriting_evaluation.metrics.clip import SignWritingCLIPScore + except ImportError: + raise ImportError("Please install signwriting-evaluation to use the SignWriting CLIP metric.") + + return SignWritingCLIPScore() + + +def raw_corpus_signwriting_clip(hypotheses_factors: List[Iterable[str]], + references_factors: List[Iterable[str]]) -> float: + """ + Simple wrapper around the signwriting-evaluation clip score. + + :param hypotheses_factors: Hypothesis factors streams. + :param references_factors: Reference factors streams. + :return: CLIPScore score as float between -1 and 1. + """ + metric = load_signwriting_clip() + + hypotheses = detokenize_signwriting(serialize_factors(hypotheses_factors)) + references = detokenize_signwriting(serialize_factors(references_factors)) + return metric.corpus_score(hypotheses, [references]) + + def main(): params = argparse.ArgumentParser(description='Evaluate translations by calculating metrics with ' 'respect to a reference set. If multiple hypotheses files are given ' @@ -163,6 +227,10 @@ def main(): func = raw_corpus_rougel elif name == C.TER: func = raw_corpus_ter + elif name == C.SIGNWRITING_CLIP: + func = raw_corpus_signwriting_clip + elif name == C.SIGNWRITING_SIMILARITY: + func = raw_corpus_signwriting_similarity else: raise ValueError("Unknown metric %s." % name) metrics.append((name, func)) @@ -196,4 +264,4 @@ def _print_mean_std_score(metrics: List[Tuple[str, Callable]], scores: Dict[str, if __name__ == '__main__': - main() + main() \ No newline at end of file From 93099c7ba7695a0f39f9d3e3a7b035664ae94fca Mon Sep 17 00:00:00 2001 From: Amit Moryossef Date: Sat, 10 Feb 2024 17:16:44 +0100 Subject: [PATCH 2/3] chore(): changelog and bump release version --- CHANGELOG.md | 7 +++++++ sockeye/__init__.py | 2 +- sockeye/evaluate.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 317a51cef..484398eb0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,13 @@ Note that Sockeye has checks in place to not translate with an old model that wa Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_. +## [3.1.38] + +### Changed + +- Added support for [signwriting-evaluation](https://github.com/sign-language-processing/signwriting-evaluation) to + allow evaluating SignWriting text translation outputs. + ## [3.1.37] ### Fixed diff --git a/sockeye/__init__.py b/sockeye/__init__.py index b017b08c9..80b1416a6 100644 --- a/sockeye/__init__.py +++ b/sockeye/__init__.py @@ -11,4 +11,4 @@ # express or implied. See the License for the specific language governing # permissions and limitations under the License. -__version__ = '3.1.37' +__version__ = '3.1.38' diff --git a/sockeye/evaluate.py b/sockeye/evaluate.py index 47a6db8a6..f88b506f7 100644 --- a/sockeye/evaluate.py +++ b/sockeye/evaluate.py @@ -264,4 +264,4 @@ def _print_mean_std_score(metrics: List[Tuple[str, Callable]], scores: Dict[str, if __name__ == '__main__': - main() \ No newline at end of file + main() From baf2a5afe27f5abea7c816a049888210809f14db Mon Sep 17 00:00:00 2001 From: Amit Moryossef Date: Sat, 2 Mar 2024 13:24:41 +0100 Subject: [PATCH 3/3] fix(evaluate): clip score remove cache directory --- sockeye/evaluate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sockeye/evaluate.py b/sockeye/evaluate.py index f88b506f7..b013e7b8c 100644 --- a/sockeye/evaluate.py +++ b/sockeye/evaluate.py @@ -163,7 +163,8 @@ def load_signwriting_clip(): except ImportError: raise ImportError("Please install signwriting-evaluation to use the SignWriting CLIP metric.") - return SignWritingCLIPScore() + # Not using cache_directory to avoid multiple processes accessing at the same time + return SignWritingCLIPScore(cache_directory=None) def raw_corpus_signwriting_clip(hypotheses_factors: List[Iterable[str]],