Skip to content

Commit

Permalink
Merge branch 'master' into topic/miurahr/matches/show-files-source-of…
Browse files Browse the repository at this point in the history
…-segment-matches
  • Loading branch information
miurahr authored Dec 21, 2024
2 parents 80b3575 + ffc7616 commit 360e2b5
Show file tree
Hide file tree
Showing 14 changed files with 693 additions and 270 deletions.
347 changes: 185 additions & 162 deletions aligner/src/main/java/org/omegat/gui/align/AlignFilePickerController.java

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,7 @@ dependencies {
testAcceptanceImplementation(testFixtures(project.rootProject))
testAcceptanceImplementation(libs.assertj.swing.junit)
testAcceptanceImplementation(libs.bundles.jackson)
testAcceptanceImplementation(project(':aligner'))

testIntegrationImplementation sourceSets.main.output, sourceSets.test.output
testIntegrationImplementation(testFixtures(project.rootProject))
Expand Down
2 changes: 1 addition & 1 deletion config/checkstyle/suppressions.xml
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@

<!-- Aligner, AlignFilePickerController, AlignPanelController -->
<suppress files="Aligner\.java" checks="ParameterNumber"/>
<suppress files="AlignFilePickerController\.java|AlignPanelController\.java" checks="MethodLength|FileLength"/>
<suppress files="AlignPanelController\.java" checks="MethodLength|FileLength"/>

<!-- machinetranslators -->
<suppress checks="(DesignForExtension|MagicNumber|MemberName|HideUtilityClassConstructor)"
Expand Down
7 changes: 3 additions & 4 deletions src/org/omegat/core/statistics/CalcMatchStatistics.java
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ public class CalcMatchStatistics extends LongProcessThread {
public CalcMatchStatistics(IStatsConsumer callback, boolean perFile) {
this(Core.getProject(), Core.getSegmenter(), callback, perFile,
Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
OConsts.FUZZY_MATCH_THRESHOLD));
OConsts.FUZZY_MATCH_THRESHOLD));
}

public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback,
Expand All @@ -119,8 +119,7 @@ public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer
this.callback = callback;
this.perFile = perFile;
finder = ThreadLocal.withInitial(
() -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true,
false, false, threshold));
() -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, threshold));
}

@Override
Expand Down Expand Up @@ -313,7 +312,7 @@ Optional<MatchStatCounts> calcSimilarity(List<SourceTextEntry> untranslatedEntri
int calcMaxSimilarity(SourceTextEntry ste) {
String srcNoXmlTags = removeXmlTags(ste);
FindMatches localFinder = finder.get();
List<NearString> nears = localFinder.search(srcNoXmlTags, true, false, this::isInterrupted);
List<NearString> nears = localFinder.search(srcNoXmlTags, false, this::isInterrupted);
final Token[] strTokensStem = localFinder.tokenizeAll(ste.getSrcText());
int maxSimilarity = 0;
CACHE: for (NearString near : nears) {
Expand Down
102 changes: 66 additions & 36 deletions src/org/omegat/core/statistics/FindMatches.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
2008 Alex Buloichik
2012 Thomas Cordonnier, Martin Fleurke
2013 Aaron Madlon-Kay, Alex Buloichik
2024 Hiroshi Miura
2024 Hiroshi Miura, Thomas Cordonnier
Home page: https://www.omegat.org/
Support center: https://omegat.org/support
Expand Down Expand Up @@ -128,27 +128,15 @@ public class FindMatches {
/** Tokens for original string, includes numbers and tags. */
private Token[] strTokensAll;

// This finder used for search separate segment matches
private FindMatches separateSegmentMatcher;

private final int fuzzyMatchThreshold;

private final boolean applyThreshold;

private final Segmenter segmenter;

/**
* @param searchExactlyTheSame
* allows to search similarities with the same text as source
* segment. This mode used only for separate sentence match in
* paragraph project, i.e. where source is just part of current
* source.
*/
@Deprecated(since = "6.1.0")
public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch,
boolean searchExactlyTheSame) {
this(project, Core.getSegmenter(), maxCount, allowSeparateSegmentMatch, searchExactlyTheSame, true,
Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
OConsts.FUZZY_MATCH_THRESHOLD));
this(project, Core.getSegmenter(), maxCount, searchExactlyTheSame, Preferences.getPreferenceDefault(
Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD));
}

/**
Expand All @@ -168,19 +156,21 @@ public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentM
* @param threshold
* threshold to use.
*/
public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean allowSeparateSegmentMatch,
boolean searchExactlyTheSame, boolean applyThreshold, int threshold) {
public FindMatches(IProject project, Segmenter segmenter, int maxCount,
boolean searchExactlyTheSame, int threshold) {
this.project = project;
this.segmenter = segmenter;
this.tok = project.getSourceTokenizer();
this.srcLocale = project.getProjectProperties().getSourceLanguage().getLocale();
this.maxCount = maxCount;
this.searchExactlyTheSame = searchExactlyTheSame;
if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) {
separateSegmentMatcher = new FindMatches(project, segmenter, 1, false, true, true, threshold);
}
this.fuzzyMatchThreshold = threshold;
this.applyThreshold = applyThreshold;
}

/**
 * Search translation memories (legacy signature).
 * <p>
 * This overload only forwards to {@link #search(String, boolean, IStopped)};
 * the {@code requiresTranslation} argument is not used by this method.
 *
 * @param searchText
 *            target segment or term to search.
 * @param requiresTranslation
 *            ignored by this overload.
 * @param fillSimilarityData
 *            fill similarity data into the result of NearString objects.
 * @param stop
 *            IStopped callback object to indicate cancel operation.
 * @return list of NearString objects.
 * @throws StoppedException
 *             raised when stopped during a search process.
 * @deprecated use {@link #search(String, boolean, IStopped)} instead.
 */
@Deprecated(since = "6.1.0")
public List<NearString> search(final String searchText, final boolean requiresTranslation,
        final boolean fillSimilarityData, final IStopped stop) throws StoppedException {
    final List<NearString> matches = search(searchText, fillSimilarityData, stop);
    return matches;
}

/**
Expand All @@ -197,8 +187,33 @@ public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean
* @throws StoppedException
* raised when stopped during a search process.
*/
public List<NearString> search(String searchText, boolean requiresTranslation, boolean fillSimilarityData,
IStopped stop) throws StoppedException {
public List<NearString> search(String searchText, boolean fillSimilarityData, IStopped stop)
        throws StoppedException {
    // The separate-segment search is requested only when the project
    // does not have sentence segmenting enabled.
    boolean sentenceSegmenting = project.getProjectProperties().isSentenceSegmentingEnabled();
    return search(searchText, fillSimilarityData, stop, !sentenceSegmenting);
}

/**
* Search Translation memories.
* <p>
* Internal method to handle search conditions.
* It is accessible as package-private for testing.
*
* @param searchText
* target segment or term to search.
* @param fillSimilarityData
* fill similarity data into the result of NearString objects.
* @param stop
* IStopped callback object to indicate cancel operation.
* @param runSeparateSegmentMatch
* when true, also split the text into segments and search for a match of each segment.
* @return
* List of NearString objects.
* @throws StoppedException
* when the process is stopped during the search.
*/
List<NearString> search(String searchText, boolean fillSimilarityData, IStopped stop,
boolean runSeparateSegmentMatch) throws StoppedException {
result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
srcText = searchText;
removedText = "";
Expand Down Expand Up @@ -228,7 +243,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
// skip original==original entry comparison
return;
}
if (requiresTranslation && trans.translation == null) {
if (trans.translation == null) {
return;
}
String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
Expand All @@ -243,7 +258,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
// skip original==original entry comparison
return;
}
if (requiresTranslation && trans.translation == null) {
if (trans.translation == null) {
return;
}
String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
Expand All @@ -257,7 +272,6 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
*/
int foreignPenalty = Preferences.getPreferenceDefault(Preferences.PENALTY_FOR_FOREIGN_MATCHES,
Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT);
// travel by translation memories
for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
int penalty = 0;
Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
Expand All @@ -267,11 +281,11 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
for (ITMXEntry tmen : en.getValue().getEntries()) {
checkStopped(stop);
if (tmen.getSourceText() == null) {
// Not all TMX entries have a source; in that case there can
// be no meaningful match, so skip.
// Not all TMX entries have a source; skip such an entry
// because no meaningful match is possible.
continue;
}
if (requiresTranslation && tmen.getTranslationText() == null) {
if (tmen.getTranslationText() == null) {
continue;
}
int tmenPenalty = penalty;
Expand All @@ -292,7 +306,9 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
ste.isSourceTranslationFuzzy(), 0);
}
}
if (separateSegmentMatcher != null) {
if (runSeparateSegmentMatch) {
FindMatches separateSegmentMatcher = new FindMatches(project, segmenter, 1, true,
fuzzyMatchThreshold);
// split paragraph even when segmentation disabled, then find
// matches for every segment
List<StringBuilder> spaces = new ArrayList<>();
Expand All @@ -304,17 +320,31 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
Set<String> tmxNames = new HashSet<>();
List<String> fsrc = new ArrayList<>(segments.size());
List<String> ftrans = new ArrayList<>(segments.size());
int maxPenalty = 0;
// multiple segments
for (String onesrc : segments) {
// find match for a separate segment
List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation,
false, stop);
// find match for a separate segment.
// WARN: the 4th argument (runSeparateSegmentMatch) must be
// `false` to avoid infinite recursion.
List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, false, stop, false);
if (!segmentMatch.isEmpty()
&& segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
fsrc.add(segmentMatch.get(0).source);
ftrans.add(segmentMatch.get(0).translation);
segmentMatch.stream().filter(match -> !match.projs[0].isEmpty())
.map(match -> match.projs[0]).forEach(tmxNames::add);
if (segmentMatch.get(0).fuzzyMark) {
if (maxPenalty < PENALTY_FOR_FUZZY) {
maxPenalty = PENALTY_FOR_FUZZY;
}
}
Matcher matcher = SEARCH_FOR_PENALTY.matcher(segmentMatch.get(0).projs[0]);
if (matcher.find()) {
int penalty = Integer.parseInt(matcher.group(1));
if (penalty > maxPenalty) {
maxPenalty = penalty;
}
}
} else {
fsrc.add("");
ftrans.add("");
Expand All @@ -324,7 +354,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
PrepareTMXEntry entry = new PrepareTMXEntry();
entry.source = segmenter.glue(sourceLang, sourceLang, fsrc, spaces, brules);
entry.translation = segmenter.glue(sourceLang, targetLang, ftrans, spaces, brules);
processEntry(null, entry, String.join(",", tmxNames), NearString.MATCH_SOURCE.TM, false, 0);
processEntry(null, entry, String.join(",", tmxNames), NearString.MATCH_SOURCE.TM, false, maxPenalty);
}
}
// fill similarity data only for a result
Expand Down Expand Up @@ -420,7 +450,7 @@ public void processEntry(EntryKey key, ITMXEntry entry, String tmxName,
}

// BUGS#1236 - stat display does not use threshold config check
if (applyThreshold && similarityStem < fuzzyMatchThreshold
if (fuzzyMatchThreshold > 0 && similarityStem < fuzzyMatchThreshold
&& similarityNoStem < fuzzyMatchThreshold && simAdjusted < fuzzyMatchThreshold) {
return;
}
Expand Down
32 changes: 26 additions & 6 deletions src/org/omegat/gui/matches/FindMatchesThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
2008 Alex Buloichik
2012 Thomas Cordonnier, Martin Fleurke
2013 Aaron Madlon-Kay
2024 Hiroshi Miura
Home page: https://www.omegat.org/
Support center: https://omegat.org/support
Expand All @@ -32,17 +33,22 @@
import java.util.List;
import java.util.logging.Logger;

import org.omegat.core.Core;
import org.omegat.core.data.IProject;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.events.IStopped;
import org.omegat.core.matching.NearString;
import org.omegat.core.segmentation.Segmenter;
import org.omegat.core.statistics.FindMatches;
import org.omegat.gui.common.EntryInfoSearchThread;
import org.omegat.util.OConsts;
import org.omegat.util.Preferences;

/**
* Find matches in separate thread then show result in the matches pane.
* Find matches in a separate thread, then show the result in the matches pane.
*
* @author Alex Buloichik ([email protected])
* @author Hiroshi Miura
*/
public class FindMatchesThread extends EntryInfoSearchThread<List<NearString>> {
private static final Logger LOGGER = Logger.getLogger(FindMatchesThread.class.getName());
Expand All @@ -52,9 +58,9 @@ public class FindMatchesThread extends EntryInfoSearchThread<List<NearString>> {

/**
* Entry which is processed currently.
*
* If entry in controller was changed, it means user has moved to another entry, and there is no sense to
* continue.
* <p>
* If the entry in the controller has changed, the user has moved to
* another entry, and there is no point in continuing.
*/
private final SourceTextEntry processedEntry;

Expand All @@ -79,12 +85,26 @@ protected List<NearString> search() throws Exception {
long before = System.currentTimeMillis();

try {
FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, false);
List<NearString> result = finder.search(processedEntry.getSrcText(), true, true, this::isEntryChanged);
List<NearString> result = finderSearch(project, Core.getSegmenter(), processedEntry.getSrcText(),
this::isEntryChanged, Preferences.getPreferenceDefault(
Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD));
LOGGER.finer(() -> "Time for find matches: " + (System.currentTimeMillis() - before));
return result;
} catch (FindMatches.StoppedException ex) {
throw new EntryChangedException();
}
}

/**
 * Run a match search for the given source text (static for test purpose).
 * <p>
 * A new {@link FindMatches} is created with {@link OConsts#MAX_NEAR_STRINGS}
 * as the maximum count and {@code searchExactlyTheSame} set to false, and
 * the search is run with similarity data filled in.
 *
 * @param project OmegaT project.
 * @param segmenter segmenter handed to the {@link FindMatches} instance.
 * @param srcText source text to look for.
 * @param isEntryChanged stop callback handed to the search; the search is
 *        expected to raise {@link FindMatches.StoppedException} when it
 *        reports a stop.
 * @param threshold fuzzy match threshold handed to the {@link FindMatches} instance.
 * @return result as a list of NearString.
 */
protected static List<NearString> finderSearch(IProject project, Segmenter segmenter, String srcText,
        IStopped isEntryChanged, int threshold) {
    return new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, threshold)
            .search(srcText, true, isEntryChanged);
}
}
Loading

0 comments on commit 360e2b5

Please sign in to comment.