From f070c8c96c543064a28ef58a5a5b29d6ed206798 Mon Sep 17 00:00:00 2001
From: Amit Moryossef <amitmoryossef@gmail.com>
Date: Sat, 10 Feb 2024 17:05:39 +0100
Subject: [PATCH 1/3] feat(evaluate): add signwriting evaluation metrics

---
 sockeye/checkpoint_decoder.py | 37 ++++++++++--------
 sockeye/constants.py          | 14 ++++---
 sockeye/evaluate.py           | 72 ++++++++++++++++++++++++++++++++++-
 3 files changed, 101 insertions(+), 22 deletions(-)

diff --git a/sockeye/checkpoint_decoder.py b/sockeye/checkpoint_decoder.py
index 922682eb0..b3a4e819c 100644
--- a/sockeye/checkpoint_decoder.py
+++ b/sockeye/checkpoint_decoder.py
@@ -181,25 +181,32 @@ def decode_and_evaluate(self, output_name: Optional[str] = None) -> Dict[str, fl
         self.model.train(original_mode)
 
         # 2. Evaluate
-
-        metrics = {C.BLEU: evaluate.raw_corpus_bleu(hypotheses=translations[0],
-                                                    references=self.targets_sentences[0],
+        hypotheses = translations[0]
+        references = self.targets_sentences[0]
+        metrics = {C.BLEU: evaluate.raw_corpus_bleu(hypotheses=hypotheses, references=references,
                                                     offset=evaluate.DEFAULT_OFFSET),
-                   C.CHRF: evaluate.raw_corpus_chrf(hypotheses=translations[0],
-                                                    references=self.targets_sentences[0]),
-                   C.ROUGE1: evaluate.raw_corpus_rouge1(hypotheses=translations[0],
-                                                        references=self.targets_sentences[0]),
-                   C.ROUGE2: evaluate.raw_corpus_rouge2(hypotheses=translations[0],
-                                                        references=self.targets_sentences[0]),
-                   C.ROUGEL: evaluate.raw_corpus_rougel(hypotheses=translations[0],
-                                                        references=self.targets_sentences[0]),
-                   C.LENRATIO: evaluate.raw_corpus_length_ratio(hypotheses=translations[0],
-                                                                references=self.targets_sentences[0]),
-                   C.TER: evaluate.raw_corpus_ter(hypotheses=translations[0],
-                                                  references=self.targets_sentences[0]),
+                   C.CHRF: evaluate.raw_corpus_chrf(hypotheses=hypotheses, references=references),
+                   C.ROUGE1: evaluate.raw_corpus_rouge1(hypotheses=hypotheses, references=references),
+                   C.ROUGE2: evaluate.raw_corpus_rouge2(hypotheses=hypotheses, references=references),
+                   C.ROUGEL: evaluate.raw_corpus_rougel(hypotheses=hypotheses, references=references),
+                   C.LENRATIO: evaluate.raw_corpus_length_ratio(hypotheses=hypotheses, references=references),
+                   C.TER: evaluate.raw_corpus_ter(hypotheses=hypotheses, references=references),
                    C.AVG_TIME: avg_time,
                    C.DECODING_TIME: trans_wall_time}
 
+        # Add SignWriting Evaluation Metrics if the module is available
+        try:
+            import signwriting_evaluation
+            metrics.update({
+                C.SIGNWRITING_CLIP: evaluate.raw_corpus_signwriting_clip(
+                    hypotheses_factors=translations,
+                    references_factors=self.targets_sentences),
+                C.SIGNWRITING_SIMILARITY: evaluate.raw_corpus_signwriting_similarity(
+                    hypotheses_factors=translations,
+                    references_factors=self.targets_sentences)})
+        except ModuleNotFoundError:
+            pass
+
         if len(translations) > 1:  # metrics for other target factors
             for i, _ in enumerate(translations[1:], 1):
                 # only BLEU
diff --git a/sockeye/constants.py b/sockeye/constants.py
index 0729f2f92..0da5c21f7 100644
--- a/sockeye/constants.py
+++ b/sockeye/constants.py
@@ -261,15 +261,19 @@
 ROUGEL = 'rougel'
 BOW_PERPLEXITY = 'bow-perplexity'
 TER = 'ter'
+SIGNWRITING_CLIP = 'signwriting-clip'
+SIGNWRITING_SIMILARITY = 'signwriting-similarity'
 LENRATIO = 'length-ratio-mse'
 AVG_TIME = "avg-sec-per-sent"
 DECODING_TIME = "decode-walltime"
-METRICS = [PERPLEXITY, ACCURACY, LENRATIO_MSE, BLEU, CHRF, ROUGE1, BOW_PERPLEXITY, TER]
+METRICS = [PERPLEXITY, ACCURACY, LENRATIO_MSE, BLEU, CHRF, ROUGE1, BOW_PERPLEXITY, TER,
+           SIGNWRITING_CLIP, SIGNWRITING_SIMILARITY]
 METRIC_MAXIMIZE = {ACCURACY: True, BLEU: True, CHRF: True, ROUGE1: True, PERPLEXITY: False, LENRATIO_MSE: False,
-                   TER: False, BOW_PERPLEXITY: False}
-METRIC_WORST = {ACCURACY: 0.0, BLEU: 0.0, CHRF: 0.0, ROUGE1: 0.0, PERPLEXITY: np.inf, BOW_PERPLEXITY: np.inf, TER: np.inf}
-METRICS_REQUIRING_DECODER = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER]
-EVALUATE_METRICS = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER]
+                   TER: False, BOW_PERPLEXITY: False, SIGNWRITING_CLIP: True, SIGNWRITING_SIMILARITY: True}
+METRIC_WORST = {ACCURACY: 0.0, BLEU: 0.0, CHRF: 0.0, ROUGE1: 0.0, PERPLEXITY: np.inf, BOW_PERPLEXITY: np.inf,
+                TER: np.inf, SIGNWRITING_CLIP: -1.0, SIGNWRITING_SIMILARITY: 0.0}
+METRICS_REQUIRING_DECODER = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER, SIGNWRITING_CLIP, SIGNWRITING_SIMILARITY]
+EVALUATE_METRICS = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL, TER, SIGNWRITING_CLIP, SIGNWRITING_SIMILARITY]
 
 # loss
 CROSS_ENTROPY = 'cross-entropy'
diff --git a/sockeye/evaluate.py b/sockeye/evaluate.py
index 5e5c18ce2..47a6db8a6 100644
--- a/sockeye/evaluate.py
+++ b/sockeye/evaluate.py
@@ -18,7 +18,7 @@
 import logging
 import sys
 from collections import defaultdict
-from functools import partial
+from functools import partial, lru_cache
 from typing import Callable, Iterable, Dict, List, Tuple, Optional
 
 import numpy as np
@@ -118,6 +118,70 @@ def raw_corpus_length_ratio(hypotheses: Iterable[str], references: Iterable[str]
     return sum(ratios)/len(ratios) if len(ratios) else 0.0
 
 
+def serialize_factors(factors: List[Iterable[str]]) -> List[str]:
+    factors_list = zip(*factors)
+    for factors in factors_list:
+        factors_tokens = [f.strip().split(" ") for f in factors]
+        inverse_factors = zip(*factors_tokens)
+        yield " ".join([" ".join(f) for f in inverse_factors])
+
+
+def detokenize_signwriting(strings: List[str]) -> List[str]:
+    from signwriting.tokenizer import SignWritingTokenizer
+    tokenizer = SignWritingTokenizer()
+    signwriting_texts = [tokenizer.tokens_to_text(s.split(" ")) for s in strings]
+    # Replace every "R00", "B00", "L00" or "M00" with just its letter (the regex capture group)
+    import re
+    return [re.sub(r"([RBLM])00", r"\1", s) for s in signwriting_texts]
+
+
+def raw_corpus_signwriting_similarity(hypotheses_factors: List[Iterable[str]],
+                                      references_factors: List[Iterable[str]]) -> float:
+    """
+    Simple wrapper around the signwriting-evaluation similarity score.
+
+    :param hypotheses_factors: Hypothesis factor streams, one iterable of sentences per target factor.
+    :param references_factors: Reference factor streams, one iterable of sentences per target factor.
+    :return: Similarity score as float between 0 and 1.
+    """
+    try:
+        from signwriting_evaluation.metrics.similarity import SignWritingSimilarityMetric
+    except ImportError:
+        raise ImportError("Please install signwriting-evaluation to use the SignWriting Similarity metric.")
+
+    hypotheses = detokenize_signwriting(serialize_factors(hypotheses_factors))
+    references = detokenize_signwriting(serialize_factors(references_factors))
+
+    metric = SignWritingSimilarityMetric()
+    return metric.corpus_score(hypotheses, [references])
+
+
+@lru_cache(maxsize=1)
+def load_signwriting_clip():
+    try:
+        from signwriting_evaluation.metrics.clip import SignWritingCLIPScore
+    except ImportError:
+        raise ImportError("Please install signwriting-evaluation to use the SignWriting CLIP metric.")
+
+    return SignWritingCLIPScore()
+
+
+def raw_corpus_signwriting_clip(hypotheses_factors: List[Iterable[str]],
+                                references_factors: List[Iterable[str]]) -> float:
+    """
+    Simple wrapper around the signwriting-evaluation clip score.
+
+    :param hypotheses_factors: Hypothesis factor streams, one iterable of sentences per target factor.
+    :param references_factors: Reference factor streams, one iterable of sentences per target factor.
+    :return: CLIPScore as float between -1 and 1.
+    """
+    metric = load_signwriting_clip()
+
+    hypotheses = detokenize_signwriting(serialize_factors(hypotheses_factors))
+    references = detokenize_signwriting(serialize_factors(references_factors))
+    return metric.corpus_score(hypotheses, [references])
+
+
 def main():
     params = argparse.ArgumentParser(description='Evaluate translations by calculating metrics with '
                                                  'respect to a reference set. If multiple hypotheses files are given '
@@ -163,6 +227,10 @@ def main():
             func = raw_corpus_rougel
         elif name == C.TER:
             func = raw_corpus_ter
+        elif name == C.SIGNWRITING_CLIP:
+            func = raw_corpus_signwriting_clip
+        elif name == C.SIGNWRITING_SIMILARITY:
+            func = raw_corpus_signwriting_similarity
         else:
             raise ValueError("Unknown metric %s." % name)
         metrics.append((name, func))
@@ -196,4 +264,4 @@ def _print_mean_std_score(metrics: List[Tuple[str, Callable]], scores: Dict[str,
 
 
 if __name__ == '__main__':
-    main()
+    main()
\ No newline at end of file

From 93099c7ba7695a0f39f9d3e3a7b035664ae94fca Mon Sep 17 00:00:00 2001
From: Amit Moryossef <amitmoryossef@gmail.com>
Date: Sat, 10 Feb 2024 17:16:44 +0100
Subject: [PATCH 2/3] chore(): changelog and bump release version

---
 CHANGELOG.md        | 7 +++++++
 sockeye/__init__.py | 2 +-
 sockeye/evaluate.py | 2 +-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 317a51cef..484398eb0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,13 @@ Note that Sockeye has checks in place to not translate with an old model that wa
 
 Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.
 
+## [3.1.38]
+
+### Added
+
+- Added support for [signwriting-evaluation](https://github.com/sign-language-processing/signwriting-evaluation) to
+  allow evaluating SignWriting text translation outputs.
+
 ## [3.1.37]
 
 ### Fixed
diff --git a/sockeye/__init__.py b/sockeye/__init__.py
index b017b08c9..80b1416a6 100644
--- a/sockeye/__init__.py
+++ b/sockeye/__init__.py
@@ -11,4 +11,4 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-__version__ = '3.1.37'
+__version__ = '3.1.38'
diff --git a/sockeye/evaluate.py b/sockeye/evaluate.py
index 47a6db8a6..f88b506f7 100644
--- a/sockeye/evaluate.py
+++ b/sockeye/evaluate.py
@@ -264,4 +264,4 @@ def _print_mean_std_score(metrics: List[Tuple[str, Callable]], scores: Dict[str,
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()

From baf2a5afe27f5abea7c816a049888210809f14db Mon Sep 17 00:00:00 2001
From: Amit Moryossef <amitmoryossef@gmail.com>
Date: Sat, 2 Mar 2024 13:24:41 +0100
Subject: [PATCH 3/3] fix(evaluate): clip score remove cache directory

---
 sockeye/evaluate.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sockeye/evaluate.py b/sockeye/evaluate.py
index f88b506f7..b013e7b8c 100644
--- a/sockeye/evaluate.py
+++ b/sockeye/evaluate.py
@@ -163,7 +163,8 @@ def load_signwriting_clip():
     except ImportError:
         raise ImportError("Please install signwriting-evaluation to use the SignWriting CLIP metric.")
 
-    return SignWritingCLIPScore()
+    # Disable the cache directory so that multiple processes do not access it at the same time
+    return SignWritingCLIPScore(cache_directory=None)
 
 
 def raw_corpus_signwriting_clip(hypotheses_factors: List[Iterable[str]],