From 9457adbab0af51f3cb725ce943212986edd99fb4 Mon Sep 17 00:00:00 2001 From: uuqjz Date: Thu, 10 Aug 2023 17:13:05 +0200 Subject: [PATCH] Added neighbor record, applied naming convention, changed CLI method for MergingParamters --- cli/src/main/java/de/jplag/cli/CLI.java | 3 +- .../main/java/de/jplag/cli/CliOptions.java | 2 +- .../java/de/jplag/merging/MatchMerging.java | 108 +++++++++--------- .../main/java/de/jplag/merging/Neighbor.java | 9 ++ 4 files changed, 66 insertions(+), 56 deletions(-) create mode 100644 core/src/main/java/de/jplag/merging/Neighbor.java diff --git a/cli/src/main/java/de/jplag/cli/CLI.java b/cli/src/main/java/de/jplag/cli/CLI.java index bdcc2b284..50b2f6a6e 100644 --- a/cli/src/main/java/de/jplag/cli/CLI.java +++ b/cli/src/main/java/de/jplag/cli/CLI.java @@ -223,8 +223,7 @@ private static ClusteringOptions getClusteringOptions(CliOptions options) { } private static MergingParameters getMergingParameters(CliOptions options) { - return new MergingParameters().withEnable(options.merging.enable).withMergeBuffer(options.merging.mergeBuffer) - .withSeperatingThreshold(options.merging.seperatingThreshold); + return new MergingParameters(options.merging.enable, options.merging.mergeBuffer, options.merging.seperatingThreshold); } private String generateDescription() { diff --git a/cli/src/main/java/de/jplag/cli/CliOptions.java b/cli/src/main/java/de/jplag/cli/CliOptions.java index 02fd50ea2..c224f6ec4 100644 --- a/cli/src/main/java/de/jplag/cli/CliOptions.java +++ b/cli/src/main/java/de/jplag/cli/CliOptions.java @@ -91,7 +91,7 @@ public static class Advanced { } public static class Clustering { - @Option(names = {"--cluster-skip"}, description = "Skips the clustering (default: false)\n") + @Option(names = {"--cluster-skip"}, description = "Skips the clustering (default: false)%n") public boolean disable; @ArgGroup diff --git a/core/src/main/java/de/jplag/merging/MatchMerging.java b/core/src/main/java/de/jplag/merging/MatchMerging.java index 
a7866895a..f4f7633c6 100644 --- a/core/src/main/java/de/jplag/merging/MatchMerging.java +++ b/core/src/main/java/de/jplag/merging/MatchMerging.java @@ -1,7 +1,6 @@ package de.jplag.merging; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -14,11 +13,12 @@ /** * This class implements a match merging algorithm which serves as defense mechanism against obfuscation attacks. Based - * on configurable parameters MergeBuffer and SeperatingThreshold, it alters prior results and merges all neighboring - * matches that fit the specified thresholds. When neighboring matches get merged they become one and the tokens - * separating them get removed from the submission clone. MergeBuffer describes how shorter a match can be than the - * Minimum Token Match. SeperatingThreshold describes how many tokens can be between two neighboring matches. Both are - * set in {@link JPlagOptions} as {@link MergingParameters} and default to 0 (which deactivates merging). + * on configurable parameters MergeBuffer and SeperatingThreshold, it alters prior results from pairwise submission + * comparisons and merges all neighboring matches that fit the specified thresholds. Submissions are referred to as left + * and right and neighboring matches as upper and lower. When neighboring matches get merged they become one and the + * tokens separating them get removed from the submission clone. MergeBuffer describes how shorter a match can be than + * the Minimum Token Match. SeperatingThreshold describes how many tokens can be between two neighboring matches. Both + * are set in {@link JPlagOptions} as {@link MergingParameters} and default to 0 (which deactivates merging). 
*/ public class MatchMerging { private JPlagOptions options; @@ -44,12 +44,12 @@ public JPlagResult mergeMatchesOf(JPlagResult result) { List comparisonsMerged = new ArrayList<>(); for (JPlagComparison comparison : comparisons) { - Submission firstSubmission = comparison.firstSubmission().copy(); - Submission secondSubmission = comparison.secondSubmission().copy(); + Submission leftSubmission = comparison.firstSubmission().copy(); + Submission rightSubmission = comparison.secondSubmission().copy(); List globalMatches = new ArrayList<>(comparison.matches()); globalMatches.addAll(comparison.ignoredMatches()); - globalMatches = removeTooShortMatches(mergeNeighbors(globalMatches, firstSubmission, secondSubmission)); - comparisonsMerged.add(new JPlagComparison(firstSubmission, secondSubmission, globalMatches, new ArrayList<>())); + globalMatches = removeTooShortMatches(mergeNeighbors(globalMatches, leftSubmission, rightSubmission)); + comparisonsMerged.add(new JPlagComparison(leftSubmission, rightSubmission, globalMatches, new ArrayList<>())); } long durationInMillis = System.currentTimeMillis() - timeBeforeStartInMillis; @@ -57,20 +57,20 @@ public JPlagResult mergeMatchesOf(JPlagResult result) { } /** - * Computes neighbors by sorting based on order of matches in the first and the second submission and then checking - * which are next to each other in both. + * Computes neighbors by sorting based on order of matches in the left and right submissions and then checking which are + * next to each other in both. 
* @param globalMatches * @return neighbors containing a list of pairs of neighboring matches */ - private List> computeNeighbors(List globalMatches) { - List> neighbors = new ArrayList<>(); - List sortedByFirst = new ArrayList<>(globalMatches); - Collections.sort(sortedByFirst, (match1, match2) -> match1.startOfFirst() - match2.startOfFirst()); - List sortedBySecond = new ArrayList<>(globalMatches); - Collections.sort(sortedBySecond, (match1, match2) -> match1.startOfSecond() - match2.startOfSecond()); - for (int i = 0; i < sortedByFirst.size() - 1; i++) { - if (sortedBySecond.indexOf(sortedByFirst.get(i)) == (sortedBySecond.indexOf(sortedByFirst.get(i + 1)) - 1)) { - neighbors.add(Arrays.asList(sortedByFirst.get(i), sortedByFirst.get(i + 1))); + private List computeNeighbors(List globalMatches) { + List neighbors = new ArrayList<>(); + List sortedByLeft = new ArrayList<>(globalMatches); + Collections.sort(sortedByLeft, (match1, match2) -> match1.startOfFirst() - match2.startOfFirst()); + List sortedByRight = new ArrayList<>(globalMatches); + Collections.sort(sortedByRight, (match1, match2) -> match1.startOfSecond() - match2.startOfSecond()); + for (int i = 0; i < sortedByLeft.size() - 1; i++) { + if (sortedByRight.indexOf(sortedByLeft.get(i)) == (sortedByRight.indexOf(sortedByLeft.get(i + 1)) - 1)) { + neighbors.add(new Neighbor(sortedByLeft.get(i), sortedByLeft.get(i + 1))); } } return neighbors; @@ -82,25 +82,25 @@ private List> computeNeighbors(List globalMatches) { * criteria * @return globalMatches containing merged matches. 
*/ - private List mergeNeighbors(List globalMatches, Submission firstSubmission, Submission secondSubmission) { + private List mergeNeighbors(List globalMatches, Submission leftSubmission, Submission rightSubmission) { int i = 0; - List> neighbors = computeNeighbors(globalMatches); + List neighbors = computeNeighbors(globalMatches); while (i < neighbors.size()) { - Match firstNeighbor = neighbors.get(i).get(0); - Match secondNeighbor = neighbors.get(i).get(1); - - int lengthUpper = firstNeighbor.length(); - int lengthLower = secondNeighbor.length(); - int tokenBetweenFirst = secondNeighbor.startOfFirst() - firstNeighbor.endOfFirst() - 1; - int tokensBetweenSecond = secondNeighbor.startOfSecond() - firstNeighbor.endOfSecond() - 1; - double averageTokensBetweenMatches = (tokenBetweenFirst + tokensBetweenSecond) / 2.0; + Match upperNeighbor = neighbors.get(i).upperMatch(); + Match lowerNeighbor = neighbors.get(i).lowerMatch(); + + int lengthUpper = upperNeighbor.length(); + int lengthLower = lowerNeighbor.length(); + int tokenBetweenLeft = lowerNeighbor.startOfFirst() - upperNeighbor.endOfFirst() - 1; + int tokensBetweenRight = lowerNeighbor.startOfSecond() - upperNeighbor.endOfSecond() - 1; + double averageTokensBetweenMatches = (tokenBetweenLeft + tokensBetweenRight) / 2.0; // Checking length is not necessary as GST already checked length while computing matches if (averageTokensBetweenMatches <= options.mergingParameters().seperatingThreshold()) { - globalMatches.removeAll(neighbors.get(i)); - globalMatches.add(new Match(firstNeighbor.startOfFirst(), firstNeighbor.startOfSecond(), lengthUpper + lengthLower)); - globalMatches = removeToken(globalMatches, firstSubmission, secondSubmission, firstNeighbor.startOfFirst(), - firstNeighbor.startOfSecond(), lengthUpper, tokenBetweenFirst, tokensBetweenSecond); + globalMatches.remove(upperNeighbor); + globalMatches.remove(lowerNeighbor); + globalMatches.add(new Match(upperNeighbor.startOfFirst(), 
upperNeighbor.startOfSecond(), lengthUpper + lengthLower)); + globalMatches = removeToken(globalMatches, leftSubmission, rightSubmission, upperNeighbor, tokenBetweenLeft, tokensBetweenRight); neighbors = computeNeighbors(globalMatches); i = 0; } else { @@ -114,30 +114,32 @@ private List mergeNeighbors(List globalMatches, Submission firstSu * This function removes token from both submissions after a merge has been performed. Additionally it moves the * starting positions from matches, that occur after the merged neighboring matches, by the amount of removed token. * @param globalMatches - * @param firstSubmission is the first submission - * @param secondSubmission is the second submission - * @param startFirst begin of the upper neighbor in the first submission - * @param startSecond begin of the upper neighbor in the second submission - * @param lengthUpper length of the upper neighbor - * @param tokensBetweenFirst amount of token that separate the neighboring matches in the first submission and need to - * be removed - * @param tokensBetweenSecond amount token that separate the neighboring matches in the send submission and need to be + * @param leftSubmission is the left submission + * @param rightSubmission is the right submission + * @param upperNeighbor is the upper neighboring match + * @param tokensBetweenLeft amount of token that separate the neighboring matches in the left submission and need to be + * removed + * @param tokensBetweenRight amount of token that separate the neighboring matches in the right submission and need to be * removed * @return shiftedMatches with the mentioned changes. 
*/ - private List removeToken(List globalMatches, Submission firstSubmission, Submission secondSubmission, int startFirst, - int startSecond, int lengthUpper, int tokensBetweenFirst, int tokensBetweenSecond) { - List tokenFirst = new ArrayList<>(firstSubmission.getTokenList()); - List tokenSecond = new ArrayList<>(secondSubmission.getTokenList()); - tokenFirst.subList(startFirst + lengthUpper, startFirst + lengthUpper + tokensBetweenFirst).clear(); - tokenSecond.subList(startSecond + lengthUpper, startSecond + lengthUpper + tokensBetweenSecond).clear(); - firstSubmission.setTokenList(tokenFirst); - secondSubmission.setTokenList(tokenSecond); + private List removeToken(List globalMatches, Submission leftSubmission, Submission rightSubmission, Match upperNeighbor, + int tokensBetweenLeft, int tokensBetweenRight) { + int startLeft = upperNeighbor.startOfFirst(); + int startRight = upperNeighbor.startOfSecond(); + int lengthUpper = upperNeighbor.length(); + + List tokenLeft = new ArrayList<>(leftSubmission.getTokenList()); + List tokenRight = new ArrayList<>(rightSubmission.getTokenList()); + tokenLeft.subList(startLeft + lengthUpper, startLeft + lengthUpper + tokensBetweenLeft).clear(); + tokenRight.subList(startRight + lengthUpper, startRight + lengthUpper + tokensBetweenRight).clear(); + leftSubmission.setTokenList(tokenLeft); + rightSubmission.setTokenList(tokenRight); List shiftedMatches = new ArrayList<>(); for (Match match : globalMatches) { - int leftShift = match.startOfFirst() > startFirst ? tokensBetweenFirst : 0; - int rightShift = match.startOfSecond() > startSecond ? tokensBetweenSecond : 0; + int leftShift = match.startOfFirst() > startLeft ? tokensBetweenLeft : 0; + int rightShift = match.startOfSecond() > startRight ? 
tokensBetweenRight : 0; Match alteredMatch = new Match(match.startOfFirst() - leftShift, match.startOfSecond() - rightShift, match.length()); shiftedMatches.add(alteredMatch); } diff --git a/core/src/main/java/de/jplag/merging/Neighbor.java b/core/src/main/java/de/jplag/merging/Neighbor.java new file mode 100644 index 000000000..91cb93af7 --- /dev/null +++ b/core/src/main/java/de/jplag/merging/Neighbor.java @@ -0,0 +1,9 @@ +package de.jplag.merging; + +import de.jplag.Match; + +/* + * This class realizes a pair of neighboring matches, named upperMatch and lowerMatch + */ +public record Neighbor(Match upperMatch, Match lowerMatch) { +} \ No newline at end of file