Skip to content

Commit

Permalink
fix: respect foreign match penalty in segmented matches (#1231)
Browse files Browse the repository at this point in the history
* fix: respect foreign match penalty in segmented matches

Signed-off-by: Hiroshi Miura <[email protected]>

* chore: FindMatchesTest: add a test case for a foreign-language TMX.

Signed-off-by: Hiroshi Miura <[email protected]>

---------

Signed-off-by: Hiroshi Miura <[email protected]>
  • Loading branch information
miurahr authored Dec 21, 2024
1 parent ffc7616 commit c961d0e
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 14 deletions.
7 changes: 7 additions & 0 deletions src/org/omegat/core/matching/NearString.java
Original file line number Diff line number Diff line change
Expand Up @@ -204,11 +204,18 @@ public static class Scores {
public final int scoreNoStem;
/** adjusted similarity score for match including all tokens */
public final int adjustedScore;
/** penalty of the match */
public final int penalty;

/**
 * Creates scores without a penalty.
 * <p>
 * Delegates to the four-argument constructor, passing {@code 0} as the
 * penalty.
 *
 * @param score
 *            value stored in the {@code score} field
 * @param scoreNoStem
 *            value stored in the {@code scoreNoStem} field
 * @param adjustedScore
 *            adjusted similarity score for match including all tokens
 */
public Scores(int score, int scoreNoStem, int adjustedScore) {
this(score, scoreNoStem, adjustedScore, 0);
}

/**
 * Creates scores together with the penalty of the match.
 * <p>
 * Every argument is stored unchanged in the final field of the same name.
 *
 * @param score
 *            value stored in the {@code score} field
 * @param scoreNoStem
 *            value stored in the {@code scoreNoStem} field
 * @param adjustedScore
 *            adjusted similarity score for match including all tokens
 * @param penalty
 *            penalty of the match
 */
public Scores(int score, int scoreNoStem, int adjustedScore, int penalty) {
this.score = score;
this.scoreNoStem = scoreNoStem;
this.adjustedScore = adjustedScore;
this.penalty = penalty;
}

public String toString() {
Expand Down
10 changes: 2 additions & 8 deletions src/org/omegat/core/statistics/FindMatches.java
Original file line number Diff line number Diff line change
Expand Up @@ -333,13 +333,7 @@ List<NearString> search(String searchText, boolean fillSimilarityData, IStopped
maxPenalty = PENALTY_FOR_FUZZY;
}
}
Matcher matcher = SEARCH_FOR_PENALTY.matcher(segmentMatch.get(0).projs[0]);
if (matcher.find()) {
int penalty = Integer.parseInt(matcher.group(1));
if (penalty > maxPenalty) {
maxPenalty = penalty;
}
}
maxPenalty = Math.max(maxPenalty, segmentMatch.get(0).scores[0].penalty);
} else {
fsrc.add("");
ftrans.add("");
Expand Down Expand Up @@ -451,7 +445,7 @@ public void processEntry(EntryKey key, ITMXEntry entry, String tmxName,
}

addNearString(key, entry, comesFrom, fuzzy, new NearString.Scores(similarityStem, similarityNoStem,
simAdjusted), tmxName);
simAdjusted, penalty), tmxName);
}

/**
Expand Down
17 changes: 17 additions & 0 deletions test/data/tmx/segment_2.tmx
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE tmx PUBLIC "-//LISA OSCAR:1998//DTD for Translation Memory eXchange//EN" "tmx14.dtd">

<tmx version="1.4">
<header creationtoolversion="0.1" adminlang="en" segtype="paragraph" creationdate="20230930T155211Z"
datatype="unknown" srclang="ja" creationtool="txt2tmx" o-tmf="TextEdit"></header>
<body>
<tu>
<tuv xml:lang="en">
<seg>weird behavior</seg>
</tuv>
<tuv xml:lang="ja">
<seg>地力の搾取と浪費が現われる。(1)</seg>
</tuv>
</tu>
</body>
</tmx>
65 changes: 61 additions & 4 deletions test/src/org/omegat/core/statistics/FindMatchesTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@

import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;

import org.omegat.core.Core;
Expand Down Expand Up @@ -78,6 +77,7 @@ public class FindMatchesTest {
private static final File TMX_EN_US_SR = new File("test/data/tmx/en-US_sr.tmx");
private static final File TMX_EN_US_GB_SR = new File("test/data/tmx/en-US_en-GB_fr_sr.tmx");
private static final File TMX_SEGMENT = new File("test/data/tmx/penalty-010/segment_1.tmx");
private static final File TMX_SEGMENT_2 = new File("test/data/tmx/segment_2.tmx");
private static final File TMX_MULTI = new File("test/data/tmx/test-multiple-entries.tmx");
private static Path tmpDir;

Expand Down Expand Up @@ -220,7 +220,6 @@ public void testSearchRFE1578_2() throws Exception {
assertEquals("ZZZ", result.get(2).translation); // sr
}

@Ignore("Should be enalbed when the bug fix proposed.")
@Test
public void testSearchBUGS1251() throws Exception {
ProjectProperties prop = new ProjectProperties(tmpDir.toFile());
Expand All @@ -231,7 +230,6 @@ public void testSearchBUGS1251() throws Exception {
Segmenter segmenter = new Segmenter(SRX.getDefault());
IProject project = new TestProject(prop, null, TMX_SEGMENT, new LuceneCJKTokenizer(),
new LuceneFrenchTokenizer(), segmenter);
Core.setProject(project);
SourceTextEntry ste = project.getAllEntries().get(1);
Language sourceLanguage = prop.getSourceLanguage();
String srcText = ste.getSrcText();
Expand All @@ -243,10 +241,69 @@ public void testSearchBUGS1251() throws Exception {
FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30);
List<NearString> result = finder.search(srcText, false, iStopped);
assertEquals(srcText, result.get(0).source);
assertEquals(1, result.size());
assertEquals(2, result.size());
// match normal
assertEquals("TM", result.get(0).comesFrom.name());
assertEquals(90, result.get(0).scores[0].score);
assertEquals("weird behavior", result.get(0).translation);
assertTrue(result.get(0).projs[0].contains("penalty-010"));
// match segmented, with penalty
assertEquals("TM", result.get(1).comesFrom.name());
assertEquals(90, result.get(1).scores[0].score);
assertEquals(10, result.get(1).scores[0].penalty);
// FIXME
//assertTrue(result.get(1).projs[0].contains("penalty-010"));
}

/**
 * Searches a ja-to-fr project against an external TMX that only offers a
 * ja-en pair, and expects exactly one match that carries the default
 * penalty for foreign matches.
 */
@Test
public void testSearchForeign() throws Exception {
    // Project setup: Japanese source, French target, no sentence segmenting.
    ProjectProperties props = new ProjectProperties(tmpDir.toFile());
    props.setSourceLanguage("ja");
    props.setTargetLanguage("fr");
    props.setSupportDefaultTranslations(true);
    props.setSentenceSegmentingEnabled(false);

    Segmenter seg = new Segmenter(SRX.getDefault());
    // external TMX is ja-en
    IProject testProject = new TestProject(props, null, TMX_SEGMENT_2, new LuceneCJKTokenizer(),
            new LuceneFrenchTokenizer(), seg);

    // The second project entry supplies the text to look up.
    String sourceText = testProject.getAllEntries().get(1).getSrcText();
    IStopped neverStopped = () -> false;
    FindMatches matcher = new FindMatches(testProject, seg, OConsts.MAX_NEAR_STRINGS, false, 30);
    List<NearString> matches = matcher.search(sourceText, false, neverStopped);

    assertEquals(1, matches.size());
    NearString onlyMatch = matches.get(0);
    assertEquals(sourceText, onlyMatch.source);

    int expectedPenalty = Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT;
    assertEquals(expectedPenalty, onlyMatch.scores[0].penalty);
}

/**
 * Searches a three-sentence English source text in an en-to-fr project
 * backed by the {@code TMX_MATCH_EN_CA} memory.
 * <p>
 * Two matches are expected. The second one is the foreign, segmented match
 * and must report the default penalty for foreign matches in its scores.
 */
@Test
public void testSearchForeignSegmented() throws Exception {
    ProjectProperties prop = new ProjectProperties(tmpDir.toFile());
    prop.setSourceLanguage("en");
    prop.setTargetLanguage("fr");
    prop.setSupportDefaultTranslations(true);
    prop.setSentenceSegmentingEnabled(false);
    Segmenter segmenter = new Segmenter(SRX.getDefault());
    IProject project = new TestProject(prop, null, TMX_MATCH_EN_CA, new LuceneEnglishTokenizer(),
            new DefaultTokenizer(), segmenter);
    IStopped iStopped = () -> false;
    String srcText = "This badge is granted when you’ve invited 5 people who subsequently spent enough "
            + "time on the site to become full members. "
            + "Wow! "
            + "Thanks for expanding the diversity of our community with new members!";
    FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30);
    List<NearString> result = finder.search(srcText, false, iStopped);
    assertEquals(2, result.size());
    // first hit: the score is asserted once, with a message for failure reports
    // (the original repeated the same check a second time without a message)
    assertEquals("Hit with segmented tmx record", 35, result.get(0).scores[0].score);
    assertEquals(32, result.get(0).scores[0].scoreNoStem);
    assertEquals(32, result.get(0).scores[0].adjustedScore);
    // a foreign and segmented match
    assertEquals(21, result.get(1).scores[0].scoreNoStem);
    assertEquals(35, result.get(1).scores[0].adjustedScore);
    int foreignPenalty = Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT;
    assertEquals(foreignPenalty, result.get(1).scores[0].penalty);
}

@Test
Expand Down
2 changes: 0 additions & 2 deletions test/src/org/omegat/gui/matches/FindMatchesThreadTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;

import org.omegat.core.Core;
Expand Down Expand Up @@ -89,7 +88,6 @@ public void setUp() throws Exception {
Core.registerTokenizerClass(LuceneEnglishTokenizer.class);
}

@Ignore("Should be enalbed when the bug fix proposed.")
@Test
public void testSearchBUGS1248() throws Exception {
ProjectProperties prop = new ProjectProperties(tmpDir.toFile());
Expand Down

0 comments on commit c961d0e

Please sign in to comment.