From c961d0e04f29f45e84bb4e6473450c53bfa04564 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Sat, 21 Dec 2024 14:10:52 +0900 Subject: [PATCH] fix: respect foreign match penalty in segmented matches (#1231) * fix: respect foreign match penalty in segmented matches Signed-off-by: Hiroshi Miura * chore: FindMatchesTest: the case of foreign language TMX. Signed-off-by: Hiroshi Miura --------- Signed-off-by: Hiroshi Miura --- src/org/omegat/core/matching/NearString.java | 7 ++ .../omegat/core/statistics/FindMatches.java | 10 +-- test/data/tmx/segment_2.tmx | 17 +++++ .../core/statistics/FindMatchesTest.java | 65 +++++++++++++++++-- .../gui/matches/FindMatchesThreadTest.java | 2 - 5 files changed, 87 insertions(+), 14 deletions(-) create mode 100644 test/data/tmx/segment_2.tmx diff --git a/src/org/omegat/core/matching/NearString.java b/src/org/omegat/core/matching/NearString.java index 827da2f4e5..651f63e1e5 100644 --- a/src/org/omegat/core/matching/NearString.java +++ b/src/org/omegat/core/matching/NearString.java @@ -204,11 +204,18 @@ public static class Scores { public final int scoreNoStem; /** adjusted similarity score for match including all tokens */ public final int adjustedScore; + /** penalty of the match */ + public final int penalty; public Scores(int score, int scoreNoStem, int adjustedScore) { + this(score, scoreNoStem, adjustedScore, 0); + } + + public Scores(int score, int scoreNoStem, int adjustedScore, int penalty) { this.score = score; this.scoreNoStem = scoreNoStem; this.adjustedScore = adjustedScore; + this.penalty = penalty; } public String toString() { diff --git a/src/org/omegat/core/statistics/FindMatches.java b/src/org/omegat/core/statistics/FindMatches.java index a9bc2b2e47..7999fdef6b 100644 --- a/src/org/omegat/core/statistics/FindMatches.java +++ b/src/org/omegat/core/statistics/FindMatches.java @@ -333,13 +333,7 @@ List search(String searchText, boolean fillSimilarityData, IStopped maxPenalty = PENALTY_FOR_FUZZY; } } - Matcher matcher = SEARCH_FOR_PENALTY.matcher(segmentMatch.get(0).projs[0]); - if (matcher.find()) { - int penalty = Integer.parseInt(matcher.group(1)); - if (penalty > maxPenalty) { - maxPenalty = penalty; - } - } + maxPenalty = Math.max(maxPenalty, segmentMatch.get(0).scores[0].penalty); } else { fsrc.add(""); ftrans.add(""); @@ -451,7 +445,7 @@ public void processEntry(EntryKey key, ITMXEntry entry, String tmxName, } addNearString(key, entry, comesFrom, fuzzy, new NearString.Scores(similarityStem, similarityNoStem, - simAdjusted), tmxName); + simAdjusted, penalty), tmxName); } /** diff --git a/test/data/tmx/segment_2.tmx b/test/data/tmx/segment_2.tmx new file mode 100644 index 0000000000..f4d3aec761 --- /dev/null +++ b/test/data/tmx/segment_2.tmx @@ -0,0 +1,17 @@ + + + + +
+ + + + weird behavior + + + 地力の搾取と浪費が現われる。(1) + + + +
diff --git a/test/src/org/omegat/core/statistics/FindMatchesTest.java b/test/src/org/omegat/core/statistics/FindMatchesTest.java index 128be5dc9c..2936423918 100644 --- a/test/src/org/omegat/core/statistics/FindMatchesTest.java +++ b/test/src/org/omegat/core/statistics/FindMatchesTest.java @@ -42,7 +42,6 @@ import org.junit.Before; import org.junit.BeforeClass; -import org.junit.Ignore; import org.junit.Test; import org.omegat.core.Core; @@ -78,6 +77,7 @@ public class FindMatchesTest { private static final File TMX_EN_US_SR = new File("test/data/tmx/en-US_sr.tmx"); private static final File TMX_EN_US_GB_SR = new File("test/data/tmx/en-US_en-GB_fr_sr.tmx"); private static final File TMX_SEGMENT = new File("test/data/tmx/penalty-010/segment_1.tmx"); + private static final File TMX_SEGMENT_2 = new File("test/data/tmx/segment_2.tmx"); private static final File TMX_MULTI = new File("test/data/tmx/test-multiple-entries.tmx"); private static Path tmpDir; @@ -220,7 +220,6 @@ public void testSearchRFE1578_2() throws Exception { assertEquals("ZZZ", result.get(2).translation); // sr } - @Ignore("Should be enalbed when the bug fix proposed.") @Test public void testSearchBUGS1251() throws Exception { ProjectProperties prop = new ProjectProperties(tmpDir.toFile()); @@ -231,7 +230,6 @@ public void testSearchBUGS1251() throws Exception { Segmenter segmenter = new Segmenter(SRX.getDefault()); IProject project = new TestProject(prop, null, TMX_SEGMENT, new LuceneCJKTokenizer(), new LuceneFrenchTokenizer(), segmenter); - Core.setProject(project); SourceTextEntry ste = project.getAllEntries().get(1); Language sourceLanguage = prop.getSourceLanguage(); String srcText = ste.getSrcText(); @@ -243,10 +241,69 @@ public void testSearchBUGS1251() throws Exception { FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30); List result = finder.search(srcText, false, iStopped); assertEquals(srcText, result.get(0).source); - assertEquals(1, result.size()); + assertEquals(2, result.size()); + // match normal assertEquals("TM", result.get(0).comesFrom.name()); assertEquals(90, result.get(0).scores[0].score); assertEquals("weird behavior", result.get(0).translation); + assertTrue(result.get(0).projs[0].contains("penalty-010")); + // match segmented, with penalty + assertEquals("TM", result.get(1).comesFrom.name()); + assertEquals(90, result.get(1).scores[0].score); + assertEquals(10, result.get(1).scores[0].penalty); + // FIXME + //assertTrue(result.get(1).projs[0].contains("penalty-010")); + } + + @Test + public void testSearchForeign() throws Exception { + ProjectProperties prop = new ProjectProperties(tmpDir.toFile()); + prop.setSourceLanguage("ja"); + prop.setTargetLanguage("fr"); + prop.setSupportDefaultTranslations(true); + prop.setSentenceSegmentingEnabled(false); + Segmenter segmenter = new Segmenter(SRX.getDefault()); + // external TMX is ja-en + IProject project = new TestProject(prop, null, TMX_SEGMENT_2, new LuceneCJKTokenizer(), + new LuceneFrenchTokenizer(), segmenter); + SourceTextEntry ste = project.getAllEntries().get(1); + String srcText = ste.getSrcText(); + IStopped iStopped = () -> false; + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30); + List result = finder.search(srcText, false, iStopped); + assertEquals(1, result.size()); + assertEquals(srcText, result.get(0).source); + int foreignPenalty = Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT; + assertEquals(foreignPenalty, result.get(0).scores[0].penalty); + } + + @Test + public void testSearchForeignSegmented() throws Exception { + ProjectProperties prop = new ProjectProperties(tmpDir.toFile()); + prop.setSourceLanguage("en"); + prop.setTargetLanguage("fr"); + prop.setSupportDefaultTranslations(true); + prop.setSentenceSegmentingEnabled(false); + Segmenter segmenter = new Segmenter(SRX.getDefault()); + IProject project = new TestProject(prop, null, TMX_MATCH_EN_CA, new LuceneEnglishTokenizer(), + new DefaultTokenizer(), segmenter); + IStopped iStopped = () -> false; + String srcText = "This badge is granted when you’ve invited 5 people who subsequently spent enough " + + "time on the site to become full members. " + + "Wow! " + + "Thanks for expanding the diversity of our community with new members!"; + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30); + List result = finder.search(srcText, false, iStopped); + assertEquals(2, result.size()); + assertEquals("Hit with segmented tmx record", 35, result.get(0).scores[0].score); + assertEquals(35, result.get(0).scores[0].score); + assertEquals(32, result.get(0).scores[0].scoreNoStem); + assertEquals(32, result.get(0).scores[0].adjustedScore); + // a foreign and segmented match + assertEquals(21, result.get(1).scores[0].scoreNoStem); + assertEquals(35, result.get(1).scores[0].adjustedScore); + int foreignPenalty = Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT; + assertEquals(foreignPenalty, result.get(1).scores[0].penalty); } @Test diff --git a/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java b/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java index 81246276e8..fce9961136 100644 --- a/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java +++ b/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java @@ -43,7 +43,6 @@ import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; -import org.junit.Ignore; import org.junit.Test; import org.omegat.core.Core; @@ -89,7 +88,6 @@ public void setUp() throws Exception { Core.registerTokenizerClass(LuceneEnglishTokenizer.class); } - @Ignore("Should be enalbed when the bug fix proposed.") @Test public void testSearchBUGS1248() throws Exception { ProjectProperties prop = new ProjectProperties(tmpDir.toFile());