Match merging against obfuscation attacks (jplag#1202)

Implemented match merging, which is a new defense mechanism against obfuscations like insertions, alterations, and swapping.
uuqjz · Aug 15, 2023 · 2d85029 · 2d85029
1 parent b4ab37c
commit 2d85029
Show file tree

Hide file tree

Showing 16 changed files with 811 additions and 27 deletions.
diff --git a/cli/src/main/java/de/jplag/cli/CLI.java b/cli/src/main/java/de/jplag/cli/CLI.java
@@ -23,6 +23,7 @@
 import de.jplag.clustering.ClusteringOptions;
 import de.jplag.clustering.Preprocessing;
 import de.jplag.exceptions.ExitException;
+import de.jplag.merging.MergingParameters;
 import de.jplag.options.JPlagOptions;
 import de.jplag.options.LanguageOption;
 import de.jplag.options.LanguageOptions;
@@ -165,11 +166,12 @@ public JPlagOptions buildOptionsFromArguments(ParseResult parseResult) throws Cl
         }
 
         ClusteringOptions clusteringOptions = getClusteringOptions(this.options);
+        MergingParameters mergingParameters = getMergingParameters(this.options);
 
         JPlagOptions jPlagOptions = new JPlagOptions(loadLanguage(parseResult), this.options.minTokenMatch, submissionDirectories,
                 oldSubmissionDirectories, null, this.options.advanced.subdirectory, suffixes, this.options.advanced.exclusionFileName,
                 JPlagOptions.DEFAULT_SIMILARITY_METRIC, this.options.advanced.similarityThreshold, this.options.shownComparisons, clusteringOptions,
-                this.options.advanced.debug);
+                this.options.advanced.debug, mergingParameters);
 
         String baseCodePath = this.options.baseCode;
         File baseCodeDirectory = baseCodePath == null ? null : new File(baseCodePath);
@@ -228,6 +230,10 @@ private static ClusteringOptions getClusteringOptions(CliOptions options) {
         return clusteringOptions;
     }
 
+    /**
+     * Builds the match merging parameters from the merging option group of the parsed CLI options.
+     * @param options are the parsed CLI options
+     * @return the merging parameters (enabled flag, merge buffer, separating threshold)
+     */
+    private static MergingParameters getMergingParameters(CliOptions options) {
+        var merging = options.merging;
+        return new MergingParameters(merging.enabled, merging.mergeBuffer, merging.seperatingThreshold);
+    }
+
     private String generateDescription() {
         var randomDescription = DESCRIPTIONS[RANDOM.nextInt(DESCRIPTIONS.length)];
         return String.format(DESCRIPTION_PATTERN, randomDescription, CREDITS);

diff --git a/cli/src/main/java/de/jplag/cli/CliOptions.java b/cli/src/main/java/de/jplag/cli/CliOptions.java
@@ -59,6 +59,9 @@ public class CliOptions implements Runnable {
     @ArgGroup(validate = false, heading = "Clustering%n")
     public Clustering clustering = new Clustering();
 
+    @ArgGroup(validate = false, heading = "Match Merging defense mechanism against obfuscation that merges neighboring matches based on these parameters:%n")
+    public Merging merging = new Merging();
+
     /**
      * Empty run method, so picocli prints help automatically
      */
@@ -88,7 +91,7 @@ public static class Advanced {
     }
 
     public static class Clustering {
-        @Option(names = {"--cluster-skip"}, description = "Skips the clustering (default: false)\n")
+        @Option(names = {"--cluster-skip"}, description = "Skips the clustering (default: false)%n")
         public boolean disable;
 
         @ArgGroup
@@ -109,6 +112,20 @@ public static class ClusteringEnabled {
         }
     }
 
+    /**
+     * Option group holding the command line parameters of the match merging mechanism. The field values are read when
+     * the merging parameters are built from the parsed CLI options.
+     */
+    public static class Merging {
+        @Option(names = {"--match-merging"}, description = "Enables match merging (default: false)%n")
+        public boolean enabled;
+
+        @Option(names = {
+                "--merge-buffer"}, description = "Defines how much lower the length of a match can be than the minimum match length (default: 0)%n")
+        public int mergeBuffer;
+
+        // The misspelled option name is kept for backward compatibility; the correctly spelled alias is accepted as well.
+        @Option(names = {"--seperating-threshold",
+                "--separating-threshold"}, description = "Defines how many token there can be between two neighboring matches (default: 0)%n")
+        public int seperatingThreshold;
+
+    }
+
     @Option(names = {"--cluster-spectral-bandwidth"}, hidden = true)
     public double clusterSpectralBandwidth = new ClusteringOptions().spectralKernelBandwidth();
 

diff --git a/core/src/main/java/de/jplag/GreedyStringTiling.java b/core/src/main/java/de/jplag/GreedyStringTiling.java
@@ -22,14 +22,16 @@
 public class GreedyStringTiling {
 
     private final int minimumMatchLength;
+    private final int mergeBuffer;
     private ConcurrentMap<TokenType, Integer> tokenTypeValues;
     private final Map<Submission, Set<Token>> baseCodeMarkings = new IdentityHashMap<>();
 
     private final Map<Submission, int[]> cachedTokenValueLists = new IdentityHashMap<>();
     private final Map<Submission, SubsequenceHashLookupTable> cachedHashLookupTables = new IdentityHashMap<>();
 
     public GreedyStringTiling(JPlagOptions options) {
-        this.minimumMatchLength = options.minimumTokenMatch();
+        this.mergeBuffer = options.mergingParameters().mergeBuffer();
+        this.minimumMatchLength = Math.max(options.minimumTokenMatch() - this.mergeBuffer, 1);
         this.tokenTypeValues = new ConcurrentHashMap<>();
         this.tokenTypeValues.put(SharedTokenType.FILE_END, 0);
     }
@@ -98,7 +100,7 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri
 
         // comparison uses <= because it is assumed that the last token is a pivot (FILE_END)
         if (leftTokens.size() <= minimumMatchLength || rightTokens.size() <= minimumMatchLength) {
-            return new JPlagComparison(leftSubmission, rightSubmission, List.of());
+            return new JPlagComparison(leftSubmission, rightSubmission, List.of(), List.of());
         }
 
         boolean[] leftMarked = calculateInitiallyMarked(leftSubmission);
@@ -109,6 +111,7 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri
 
         int maximumMatchLength;
         List<Match> globalMatches = new ArrayList<>();
+        List<Match> ignoredMatches = new ArrayList<>();
         do {
             maximumMatchLength = minimumMatchLength;
             List<Match> iterationMatches = new ArrayList<>();
@@ -138,7 +141,11 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri
                 }
             }
             for (Match match : iterationMatches) {
-                addMatchIfNotOverlapping(globalMatches, match);
+                if (match.length() < minimumMatchLength + mergeBuffer) {
+                    addMatchIfNotOverlapping(ignoredMatches, match);
+                } else {
+                    addMatchIfNotOverlapping(globalMatches, match);
+                }
                 int leftStartIndex = match.startOfFirst();
                 int rightStartIndex = match.startOfSecond();
                 for (int offset = 0; offset < match.length(); offset++) {
@@ -147,7 +154,7 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri
                 }
             }
         } while (maximumMatchLength != minimumMatchLength);
-        return new JPlagComparison(leftSubmission, rightSubmission, globalMatches);
+        return new JPlagComparison(leftSubmission, rightSubmission, globalMatches, ignoredMatches);
     }
 
     /**

diff --git a/core/src/main/java/de/jplag/JPlag.java b/core/src/main/java/de/jplag/JPlag.java
@@ -10,6 +10,7 @@
 import de.jplag.clustering.ClusteringFactory;
 import de.jplag.exceptions.ExitException;
 import de.jplag.exceptions.SubmissionException;
+import de.jplag.merging.MatchMerging;
 import de.jplag.options.JPlagOptions;
 import de.jplag.reporting.reportobject.model.Version;
 import de.jplag.strategy.ComparisonStrategy;
@@ -71,6 +72,12 @@ public static JPlagResult run(JPlagOptions options) throws ExitException {
 
         // Compare valid submissions.
         JPlagResult result = comparisonStrategy.compareSubmissions(submissionSet);
+
+        // Use Match Merging against obfuscation
+        if (options.mergingParameters().enabled()) {
+            result = new MatchMerging(options).mergeMatchesOf(result);
+        }
+
         if (logger.isInfoEnabled())
             logger.info("Total time for comparing submissions: {}", TimeUtil.formatDuration(result.getDuration()));
         result.setClusteringResult(ClusteringFactory.getClusterings(result.getAllComparisons(), options.clusteringOptions()));

diff --git a/core/src/main/java/de/jplag/JPlagComparison.java b/core/src/main/java/de/jplag/JPlagComparison.java
@@ -9,17 +9,18 @@
  * @param secondSubmission is the second of the two submissions.
  * @param matches is the unmodifiable list of all matches between the two submissions.
  */
-public record JPlagComparison(Submission firstSubmission, Submission secondSubmission, List<Match> matches) {
+public record JPlagComparison(Submission firstSubmission, Submission secondSubmission, List<Match> matches, List<Match> ignoredMatches) {
     /**
      * Initializes a new comparison.
      * @param firstSubmission is the first of the two submissions.
      * @param secondSubmission is the second of the two submissions.
      * @param matches is the list of all matches between the two submissions.
      */
-    public JPlagComparison(Submission firstSubmission, Submission secondSubmission, List<Match> matches) {
+    public JPlagComparison(Submission firstSubmission, Submission secondSubmission, List<Match> matches, List<Match> ignoredMatches) {
         this.firstSubmission = firstSubmission;
         this.secondSubmission = secondSubmission;
         this.matches = Collections.unmodifiableList(matches);
+        this.ignoredMatches = Collections.unmodifiableList(ignoredMatches);
     }
 
     /**

diff --git a/core/src/main/java/de/jplag/Submission.java b/core/src/main/java/de/jplag/Submission.java
@@ -294,4 +294,14 @@ private List<Integer> getOrder(List<Token> tokenList) {
         }
         return order;
     }
+
+    /**
+     * Creates a new submission from this submission's name, root file, new-flag, files and language. The copy receives
+     * a fresh list holding the same token objects and the same base code comparison.
+     * @return Submission containing shallow copies of its fields.
+     */
+    public Submission copy() {
+        Submission duplicate = new Submission(name, submissionRootFile, isNew, files, language);
+        var copiedTokens = new ArrayList<>(tokenList);
+        duplicate.setTokenList(copiedTokens);
+        duplicate.setBaseCodeComparison(baseCodeComparison);
+        return duplicate;
+    }
 }
diff --git a/core/src/main/java/de/jplag/merging/MatchMerging.java b/core/src/main/java/de/jplag/merging/MatchMerging.java
@@ -0,0 +1,203 @@
+package de.jplag.merging;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import de.jplag.JPlagComparison;
+import de.jplag.JPlagResult;
+import de.jplag.Match;
+import de.jplag.SharedTokenType;
+import de.jplag.Submission;
+import de.jplag.Token;
+import de.jplag.options.JPlagOptions;
+
+/**
+ * This class implements a match merging algorithm which serves as defense mechanism against obfuscation attacks. Based
+ * on configurable parameters MergeBuffer and SeperatingThreshold, it alters prior results from pairwise submission
+ * comparisons and merges all neighboring matches that fit the specified thresholds. Submissions are referred to as left
+ * and right and neighboring matches as upper and lower. When neighboring matches get merged they become one and the
+ * tokens separating them get removed from the submission clone. MergeBuffer describes how much shorter a match can be
+ * than the Minimum Token Match. SeperatingThreshold describes how many tokens can be between two neighboring matches. Both
+ * are set in {@link JPlagOptions} as {@link MergingParameters} and default to 0 (which deactivates merging).
+ */
+public class MatchMerging {
+    private JPlagOptions options;
+
+    /**
+     * Instantiates the match merging algorithm for a comparison result and a set of specific options.
+     * @param options encapsulates the adjustable options
+     */
+    public MatchMerging(JPlagOptions options) {
+        this.options = options;
+    }
+
+    /**
+     * Runs the match merging pipeline on every comparison of the given result. For each comparison both submissions are
+     * copied, the regular and the ignored matches are pooled, neighboring matches are merged based on
+     * {@link MergingParameters} and the matches that are still too short afterwards are dropped.
+     * @param result is the initially computed result object
+     * @return JPlagResult containing the merged matches; its duration is the original duration plus the merging time
+     */
+    public JPlagResult mergeMatchesOf(JPlagResult result) {
+        long startInMillis = System.currentTimeMillis();
+
+        List<JPlagComparison> mergedComparisons = new ArrayList<>();
+        for (JPlagComparison original : new ArrayList<>(result.getAllComparisons())) {
+            // Work on copies, as merging removes tokens from the submissions.
+            Submission left = original.firstSubmission().copy();
+            Submission right = original.secondSubmission().copy();
+
+            List<Match> candidates = new ArrayList<>(original.matches());
+            candidates.addAll(original.ignoredMatches());
+
+            List<Match> merged = mergeNeighbors(candidates, left, right);
+            List<Match> remaining = removeTooShortMatches(merged);
+            mergedComparisons.add(new JPlagComparison(left, right, remaining, new ArrayList<>()));
+        }
+
+        long elapsedInMillis = System.currentTimeMillis() - startInMillis;
+        return new JPlagResult(mergedComparisons, result.getSubmissions(), result.getDuration() + elapsedInMillis, options);
+    }
+
+    /**
+     * Computes neighbors by sorting the matches by their start position in the left and in the right submission and then
+     * checking which matches are directly next to each other in both orders.
+     * @param globalMatches is the list of matches to search for neighbors; it is not modified
+     * @return neighbors containing a list of pairs of neighboring matches
+     */
+    private List<Neighbor> computeNeighbors(List<Match> globalMatches) {
+        // Integer.compare instead of subtraction: the subtraction idiom silently overflows for extreme values.
+        List<Match> sortedByLeft = new ArrayList<>(globalMatches);
+        Collections.sort(sortedByLeft, (match1, match2) -> Integer.compare(match1.startOfFirst(), match2.startOfFirst()));
+        List<Match> sortedByRight = new ArrayList<>(globalMatches);
+        Collections.sort(sortedByRight, (match1, match2) -> Integer.compare(match1.startOfSecond(), match2.startOfSecond()));
+
+        List<Neighbor> neighbors = new ArrayList<>();
+        for (int i = 0; i < sortedByLeft.size() - 1; i++) {
+            Match upper = sortedByLeft.get(i);
+            Match lower = sortedByLeft.get(i + 1);
+            // Adjacent in the left order; they are neighbors only if they are adjacent in the right order, too.
+            if (sortedByRight.indexOf(upper) == sortedByRight.indexOf(lower) - 1) {
+                neighbors.add(new Neighbor(upper, lower));
+            }
+        }
+        return neighbors;
+    }
+
+    /**
+     * Walks through the neighboring matches and merges every pair that fits the merging criteria: the average number of
+     * tokens separating them in both submissions is at most the separating threshold, and the merge does not cross a
+     * file boundary. After each merge the separating tokens are removed, the neighbors are recomputed and the walk
+     * restarts. It ends once no remaining pair of neighbors fits the criteria.
+     * @param globalMatches is the list of matches to merge; merged pairs are removed from it
+     * @param leftSubmission is the left submission
+     * @param rightSubmission is the right submission
+     * @return globalMatches containing merged matches.
+     */
+    private List<Match> mergeNeighbors(List<Match> globalMatches, Submission leftSubmission, Submission rightSubmission) {
+        List<Neighbor> neighbors = computeNeighbors(globalMatches);
+        int index = 0;
+
+        while (index < neighbors.size()) {
+            Neighbor candidate = neighbors.get(index);
+            Match upper = candidate.upperMatch();
+            Match lower = candidate.lowerMatch();
+
+            int gapLeft = lower.startOfFirst() - upper.endOfFirst() - 1;
+            int gapRight = lower.startOfSecond() - upper.endOfSecond() - 1;
+            double averageGap = (gapLeft + gapRight) / 2.0;
+
+            // Checking length is not necessary as GST already checked length while computing matches
+            boolean mergeable = averageGap <= options.mergingParameters().seperatingThreshold()
+                    && !mergeOverlapsFiles(leftSubmission, rightSubmission, upper, gapLeft, gapRight);
+            if (!mergeable) {
+                index++;
+                continue;
+            }
+
+            globalMatches.remove(upper);
+            globalMatches.remove(lower);
+            globalMatches.add(new Match(upper.startOfFirst(), upper.startOfSecond(), upper.length() + lower.length()));
+            globalMatches = removeToken(globalMatches, leftSubmission, rightSubmission, upper, gapLeft, gapRight);
+
+            // The match list changed, so start over with freshly computed neighbors.
+            neighbors = computeNeighbors(globalMatches);
+            index = 0;
+        }
+        return globalMatches;
+    }
+
+    /**
+     * This function checks if a merge would go over file boundaries, i.e. whether the tokens separating the neighboring
+     * matches contain a FILE_END token in either submission.
+     * @param leftSubmission is the left submission
+     * @param rightSubmission is the right submission
+     * @param upperNeighbor is the upper neighboring match
+     * @param tokensBetweenLeft amount of tokens that separate the neighboring matches in the left submission and need to
+     * be removed
+     * @param tokensBetweenRight amount of tokens that separate the neighboring matches in the right submission and need
+     * to be removed
+     * @return true if the merge goes over file boundaries.
+     */
+    private boolean mergeOverlapsFiles(Submission leftSubmission, Submission rightSubmission, Match upperNeighbor, int tokensBetweenLeft,
+            int tokensBetweenRight) {
+        if (leftSubmission.getFiles().size() == 1 && rightSubmission.getFiles().size() == 1) {
+            return false;
+        }
+        int gapStartLeft = upperNeighbor.startOfFirst() + upperNeighbor.length();
+        int gapStartRight = upperNeighbor.startOfSecond() + upperNeighbor.length();
+
+        // The separating tokens are only read, so sub-list views suffice and the token lists need not be copied.
+        List<Token> separatingLeft = leftSubmission.getTokenList().subList(gapStartLeft, gapStartLeft + tokensBetweenLeft);
+        List<Token> separatingRight = rightSubmission.getTokenList().subList(gapStartRight, gapStartRight + tokensBetweenRight);
+
+        return containsFileEndToken(separatingLeft) || containsFileEndToken(separatingRight);
+    }
+
+    /**
+     * Tells whether at least one token of the given list has the type FILE_END.
+     * @param token is the list of tokens to inspect
+     * @return true if a FILE_END token is in the list
+     */
+    private boolean containsFileEndToken(List<Token> token) {
+        for (Token current : token) {
+            if (current.getType().equals(SharedTokenType.FILE_END)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Removes the tokens that separated two merged neighboring matches from both submissions. Additionally it moves the
+     * starting positions of all matches that start after the upper neighbor by the amount of removed tokens.
+     * @param globalMatches is the list of matches whose positions are adjusted; it is not modified
+     * @param leftSubmission is the left submission
+     * @param rightSubmission is the right submission
+     * @param upperNeighbor is the upper neighboring match
+     * @param tokensBetweenLeft amount of tokens that separate the neighboring matches in the left submission and need to
+     * be removed
+     * @param tokensBetweenRight amount of tokens that separate the neighboring matches in the right submission and need
+     * to be removed
+     * @return shiftedMatches, a new list with the mentioned changes.
+     */
+    private List<Match> removeToken(List<Match> globalMatches, Submission leftSubmission, Submission rightSubmission, Match upperNeighbor,
+            int tokensBetweenLeft, int tokensBetweenRight) {
+        int upperStartLeft = upperNeighbor.startOfFirst();
+        int upperStartRight = upperNeighbor.startOfSecond();
+        int gapStartLeft = upperStartLeft + upperNeighbor.length();
+        int gapStartRight = upperStartRight + upperNeighbor.length();
+
+        // Cut the separating tokens out of copies of the token lists and install the copies in the submissions.
+        List<Token> reducedLeft = new ArrayList<>(leftSubmission.getTokenList());
+        List<Token> reducedRight = new ArrayList<>(rightSubmission.getTokenList());
+        reducedLeft.subList(gapStartLeft, gapStartLeft + tokensBetweenLeft).clear();
+        reducedRight.subList(gapStartRight, gapStartRight + tokensBetweenRight).clear();
+        leftSubmission.setTokenList(reducedLeft);
+        rightSubmission.setTokenList(reducedRight);
+
+        List<Match> shiftedMatches = new ArrayList<>();
+        for (Match match : globalMatches) {
+            int newStartLeft = match.startOfFirst();
+            if (newStartLeft > upperStartLeft) {
+                newStartLeft -= tokensBetweenLeft;
+            }
+            int newStartRight = match.startOfSecond();
+            if (newStartRight > upperStartRight) {
+                newStartRight -= tokensBetweenRight;
+            }
+            shiftedMatches.add(new Match(newStartLeft, newStartRight, match.length()));
+        }
+
+        return shiftedMatches;
+    }
+
+    /**
+     * This method marks the end of the merging pipeline and removes the remaining too short matches, i.e. all matches
+     * that are shorter than the minimum token match, from the given list.
+     * @param globalMatches is the list of matches to filter; it is modified in place
+     * @return the same list instance without the too short matches
+     */
+    private List<Match> removeTooShortMatches(List<Match> globalMatches) {
+        globalMatches.removeIf(match -> match.length() < options.minimumTokenMatch());
+        return globalMatches;
+    }
+}