Skip to content

Commit

Permalink
Match merging against obfuscation attacks (jplag#1202)
Browse files Browse the repository at this point in the history
Implemented match merging, which is a new defense mechanism against obfuscations like insertions, alterations, and swapping.
  • Loading branch information
uuqjz authored Aug 15, 2023
1 parent b4ab37c commit 2d85029
Show file tree
Hide file tree
Showing 16 changed files with 811 additions and 27 deletions.
8 changes: 7 additions & 1 deletion cli/src/main/java/de/jplag/cli/CLI.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import de.jplag.clustering.ClusteringOptions;
import de.jplag.clustering.Preprocessing;
import de.jplag.exceptions.ExitException;
import de.jplag.merging.MergingParameters;
import de.jplag.options.JPlagOptions;
import de.jplag.options.LanguageOption;
import de.jplag.options.LanguageOptions;
Expand Down Expand Up @@ -165,11 +166,12 @@ public JPlagOptions buildOptionsFromArguments(ParseResult parseResult) throws Cl
}

ClusteringOptions clusteringOptions = getClusteringOptions(this.options);
MergingParameters mergingParameters = getMergingParameters(this.options);

JPlagOptions jPlagOptions = new JPlagOptions(loadLanguage(parseResult), this.options.minTokenMatch, submissionDirectories,
oldSubmissionDirectories, null, this.options.advanced.subdirectory, suffixes, this.options.advanced.exclusionFileName,
JPlagOptions.DEFAULT_SIMILARITY_METRIC, this.options.advanced.similarityThreshold, this.options.shownComparisons, clusteringOptions,
this.options.advanced.debug);
this.options.advanced.debug, mergingParameters);

String baseCodePath = this.options.baseCode;
File baseCodeDirectory = baseCodePath == null ? null : new File(baseCodePath);
Expand Down Expand Up @@ -228,6 +230,10 @@ private static ClusteringOptions getClusteringOptions(CliOptions options) {
return clusteringOptions;
}

/**
 * Translates the match merging group of the parsed command line options into the {@link MergingParameters} that are
 * handed to the JPlag options.
 * @param options are the parsed command line options
 * @return the merging parameters holding the enabled flag, the merge buffer and the separating threshold
 */
private static MergingParameters getMergingParameters(CliOptions options) {
    var merging = options.merging;
    boolean enabled = merging.enabled;
    int mergeBuffer = merging.mergeBuffer;
    int seperatingThreshold = merging.seperatingThreshold;
    return new MergingParameters(enabled, mergeBuffer, seperatingThreshold);
}

private String generateDescription() {
var randomDescription = DESCRIPTIONS[RANDOM.nextInt(DESCRIPTIONS.length)];
return String.format(DESCRIPTION_PATTERN, randomDescription, CREDITS);
Expand Down
19 changes: 18 additions & 1 deletion cli/src/main/java/de/jplag/cli/CliOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ public class CliOptions implements Runnable {
@ArgGroup(validate = false, heading = "Clustering%n")
public Clustering clustering = new Clustering();

@ArgGroup(validate = false, heading = "Match Merging defense mechanism against obfuscation that merges neighboring matches based on these parameters:%n")
public Merging merging = new Merging();

/**
* Empty run method, so picocli prints help automatically
*/
Expand Down Expand Up @@ -88,7 +91,7 @@ public static class Advanced {
}

public static class Clustering {
@Option(names = {"--cluster-skip"}, description = "Skips the clustering (default: false)\n")
@Option(names = {"--cluster-skip"}, description = "Skips the clustering (default: false)%n")
public boolean disable;

@ArgGroup
Expand All @@ -109,6 +112,20 @@ public static class ClusteringEnabled {
}
}

/**
 * Command line options of the match merging group. The values are read when the merging parameters for a JPlag run
 * are built from the CLI arguments.
 */
public static class Merging {
    /** Set when {@code --match-merging} is passed on the command line. */
    @Option(names = "--match-merging", description = "Enables match merging (default: false)%n")
    public boolean enabled;

    /** Value of {@code --merge-buffer}; see the option description for its meaning. */
    @Option(names = "--merge-buffer", description = "Defines how much lower the length of a match can be than the minimum match length (default: 0)%n")
    public int mergeBuffer;

    /** Value of {@code --seperating-threshold}; see the option description for its meaning. */
    @Option(names = "--seperating-threshold", description = "Defines how many token there can be between two neighboring matches (default: 0)%n")
    public int seperatingThreshold;
}

@Option(names = {"--cluster-spectral-bandwidth"}, hidden = true)
public double clusterSpectralBandwidth = new ClusteringOptions().spectralKernelBandwidth();

Expand Down
15 changes: 11 additions & 4 deletions core/src/main/java/de/jplag/GreedyStringTiling.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,16 @@
public class GreedyStringTiling {

private final int minimumMatchLength;
private final int mergeBuffer;
private ConcurrentMap<TokenType, Integer> tokenTypeValues;
private final Map<Submission, Set<Token>> baseCodeMarkings = new IdentityHashMap<>();

private final Map<Submission, int[]> cachedTokenValueLists = new IdentityHashMap<>();
private final Map<Submission, SubsequenceHashLookupTable> cachedHashLookupTables = new IdentityHashMap<>();

public GreedyStringTiling(JPlagOptions options) {
this.minimumMatchLength = options.minimumTokenMatch();
this.mergeBuffer = options.mergingParameters().mergeBuffer();
this.minimumMatchLength = Math.max(options.minimumTokenMatch() - this.mergeBuffer, 1);
this.tokenTypeValues = new ConcurrentHashMap<>();
this.tokenTypeValues.put(SharedTokenType.FILE_END, 0);
}
Expand Down Expand Up @@ -98,7 +100,7 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri

// comparison uses <= because it is assumed that the last token is a pivot (FILE_END)
if (leftTokens.size() <= minimumMatchLength || rightTokens.size() <= minimumMatchLength) {
return new JPlagComparison(leftSubmission, rightSubmission, List.of());
return new JPlagComparison(leftSubmission, rightSubmission, List.of(), List.of());
}

boolean[] leftMarked = calculateInitiallyMarked(leftSubmission);
Expand All @@ -109,6 +111,7 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri

int maximumMatchLength;
List<Match> globalMatches = new ArrayList<>();
List<Match> ignoredMatches = new ArrayList<>();
do {
maximumMatchLength = minimumMatchLength;
List<Match> iterationMatches = new ArrayList<>();
Expand Down Expand Up @@ -138,7 +141,11 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri
}
}
for (Match match : iterationMatches) {
addMatchIfNotOverlapping(globalMatches, match);
if (match.length() < minimumMatchLength + mergeBuffer) {
addMatchIfNotOverlapping(ignoredMatches, match);
} else {
addMatchIfNotOverlapping(globalMatches, match);
}
int leftStartIndex = match.startOfFirst();
int rightStartIndex = match.startOfSecond();
for (int offset = 0; offset < match.length(); offset++) {
Expand All @@ -147,7 +154,7 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri
}
}
} while (maximumMatchLength != minimumMatchLength);
return new JPlagComparison(leftSubmission, rightSubmission, globalMatches);
return new JPlagComparison(leftSubmission, rightSubmission, globalMatches, ignoredMatches);
}

/**
Expand Down
7 changes: 7 additions & 0 deletions core/src/main/java/de/jplag/JPlag.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import de.jplag.clustering.ClusteringFactory;
import de.jplag.exceptions.ExitException;
import de.jplag.exceptions.SubmissionException;
import de.jplag.merging.MatchMerging;
import de.jplag.options.JPlagOptions;
import de.jplag.reporting.reportobject.model.Version;
import de.jplag.strategy.ComparisonStrategy;
Expand Down Expand Up @@ -71,6 +72,12 @@ public static JPlagResult run(JPlagOptions options) throws ExitException {

// Compare valid submissions.
JPlagResult result = comparisonStrategy.compareSubmissions(submissionSet);

// Use Match Merging against obfuscation
if (options.mergingParameters().enabled()) {
result = new MatchMerging(options).mergeMatchesOf(result);
}

if (logger.isInfoEnabled())
logger.info("Total time for comparing submissions: {}", TimeUtil.formatDuration(result.getDuration()));
result.setClusteringResult(ClusteringFactory.getClusterings(result.getAllComparisons(), options.clusteringOptions()));
Expand Down
5 changes: 3 additions & 2 deletions core/src/main/java/de/jplag/JPlagComparison.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,18 @@
* @param secondSubmission is the second of the two submissions.
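* @param matches is the unmodifiable list of all matches between the two submissions.
* @param ignoredMatches is the unmodifiable list of matches that were too short to count on their own and are only kept
* as candidates for match merging.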
*/
public record JPlagComparison(Submission firstSubmission, Submission secondSubmission, List<Match> matches) {
public record JPlagComparison(Submission firstSubmission, Submission secondSubmission, List<Match> matches, List<Match> ignoredMatches) {
/**
* Initializes a new comparison.
* @param firstSubmission is the first of the two submissions.
* @param secondSubmission is the second of the two submissions.
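* @param matches is the list of all matches between the two submissions.
* @param ignoredMatches is the list of matches that were too short to count on their own and are only kept as
* candidates for match merging.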
*/
public JPlagComparison(Submission firstSubmission, Submission secondSubmission, List<Match> matches) {
public JPlagComparison(Submission firstSubmission, Submission secondSubmission, List<Match> matches, List<Match> ignoredMatches) {
this.firstSubmission = firstSubmission;
this.secondSubmission = secondSubmission;
this.matches = Collections.unmodifiableList(matches);
this.ignoredMatches = Collections.unmodifiableList(ignoredMatches);
}

/**
Expand Down
10 changes: 10 additions & 0 deletions core/src/main/java/de/jplag/Submission.java
Original file line number Diff line number Diff line change
Expand Up @@ -294,4 +294,14 @@ private List<Integer> getOrder(List<Token> tokenList) {
}
return order;
}

/**
 * Creates a new submission from this one. The token list is copied into a fresh list (the tokens themselves are
 * shared), all other fields are passed on by reference.
 * @return Submission containing shallow copies of its fields.
 */
public Submission copy() {
    Submission duplicate = new Submission(name, submissionRootFile, isNew, files, language);
    List<Token> copiedTokens = new ArrayList<>(tokenList);
    duplicate.setTokenList(copiedTokens);
    duplicate.setBaseCodeComparison(baseCodeComparison);
    return duplicate;
}
}
203 changes: 203 additions & 0 deletions core/src/main/java/de/jplag/merging/MatchMerging.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
package de.jplag.merging;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import de.jplag.JPlagComparison;
import de.jplag.JPlagResult;
import de.jplag.Match;
import de.jplag.SharedTokenType;
import de.jplag.Submission;
import de.jplag.Token;
import de.jplag.options.JPlagOptions;

/**
 * This class implements a match merging algorithm which serves as a defense mechanism against obfuscation attacks.
 * Based on the configurable parameters MergeBuffer and SeperatingThreshold, it alters prior results from pairwise
 * submission comparisons and merges all neighboring matches that fit the specified thresholds. Submissions are referred
 * to as left and right and neighboring matches as upper and lower. When neighboring matches get merged they become one
 * and the tokens separating them get removed from the submission clone. MergeBuffer describes how much shorter a match
 * can be than the Minimum Token Match. SeperatingThreshold describes how many tokens can be between two neighboring
 * matches. Both are set in {@link JPlagOptions} as {@link MergingParameters} and default to 0 (which deactivates
 * merging).
 */
public class MatchMerging {
    private final JPlagOptions options;

    /**
     * Instantiates the match merging algorithm for a set of specific options.
     * @param options encapsulates the adjustable options
     */
    public MatchMerging(JPlagOptions options) {
        this.options = options;
    }

    /**
     * Runs the internal match merging pipeline. For every comparison it works on copies of both submissions, combines
     * the regular and the ignored matches, merges neighboring matches based on {@link MergingParameters} and removes
     * the remaining too short matches afterwards.
     * @param result is the initially computed result object
     * @return JPlagResult containing the merged matches; its duration is the original duration plus the time spent
     * merging
     */
    public JPlagResult mergeMatchesOf(JPlagResult result) {
        long timeBeforeStartInMillis = System.currentTimeMillis();

        List<JPlagComparison> comparisons = new ArrayList<>(result.getAllComparisons());
        List<JPlagComparison> comparisonsMerged = new ArrayList<>(comparisons.size());

        for (JPlagComparison comparison : comparisons) {
            // Merging removes tokens, so it must only ever touch copies of the submissions.
            Submission leftSubmission = comparison.firstSubmission().copy();
            Submission rightSubmission = comparison.secondSubmission().copy();
            List<Match> globalMatches = new ArrayList<>(comparison.matches());
            globalMatches.addAll(comparison.ignoredMatches());
            globalMatches = removeTooShortMatches(mergeNeighbors(globalMatches, leftSubmission, rightSubmission));
            comparisonsMerged.add(new JPlagComparison(leftSubmission, rightSubmission, globalMatches, new ArrayList<>()));
        }

        long durationInMillis = System.currentTimeMillis() - timeBeforeStartInMillis;
        return new JPlagResult(comparisonsMerged, result.getSubmissions(), result.getDuration() + durationInMillis, options);
    }

    /**
     * Computes neighbors by sorting based on order of matches in the left and right submissions and then checking which
     * are next to each other in both.
     * @param globalMatches is the list of matches to search for neighbors; it is not modified
     * @return neighbors containing a list of pairs of neighboring matches
     */
    private List<Neighbor> computeNeighbors(List<Match> globalMatches) {
        List<Match> sortedByLeft = new ArrayList<>(globalMatches);
        Collections.sort(sortedByLeft, Comparator.comparingInt(Match::startOfFirst));
        List<Match> sortedByRight = new ArrayList<>(globalMatches);
        Collections.sort(sortedByRight, Comparator.comparingInt(Match::startOfSecond));

        // Look-up table replacing repeated linear indexOf scans; putIfAbsent keeps the first position of equal matches,
        // which is exactly what indexOf would report.
        Map<Match, Integer> positionInRight = new HashMap<>();
        for (int position = 0; position < sortedByRight.size(); position++) {
            positionInRight.putIfAbsent(sortedByRight.get(position), position);
        }

        List<Neighbor> neighbors = new ArrayList<>();
        for (int i = 0; i < sortedByLeft.size() - 1; i++) {
            Match upper = sortedByLeft.get(i);
            Match lower = sortedByLeft.get(i + 1);
            int upperPosition = positionInRight.get(upper);
            int lowerPosition = positionInRight.get(lower);
            if (upperPosition == lowerPosition - 1) {
                neighbors.add(new Neighbor(upper, lower));
            }
        }
        return neighbors;
    }

    /**
     * This function iterates through the neighboring matches and checks which fit the merging criteria. Those who do
     * are merged and the original matches are removed. This is done until there are either no neighbors left, or none
     * fit the criteria.
     * @param globalMatches is the mutable list of matches to merge
     * @param leftSubmission is the left submission (copy), whose separating tokens get removed on a merge
     * @param rightSubmission is the right submission (copy), whose separating tokens get removed on a merge
     * @return globalMatches containing merged matches.
     */
    private List<Match> mergeNeighbors(List<Match> globalMatches, Submission leftSubmission, Submission rightSubmission) {
        int i = 0;
        List<Neighbor> neighbors = computeNeighbors(globalMatches);

        while (i < neighbors.size()) {
            Match upperNeighbor = neighbors.get(i).upperMatch();
            Match lowerNeighbor = neighbors.get(i).lowerMatch();

            int lengthUpper = upperNeighbor.length();
            int lengthLower = lowerNeighbor.length();
            int tokensBetweenLeft = lowerNeighbor.startOfFirst() - upperNeighbor.endOfFirst() - 1;
            int tokensBetweenRight = lowerNeighbor.startOfSecond() - upperNeighbor.endOfSecond() - 1;
            double averageTokensBetweenMatches = (tokensBetweenLeft + tokensBetweenRight) / 2.0;
            // Checking length is not necessary as GST already checked length while computing matches
            if (averageTokensBetweenMatches <= options.mergingParameters().seperatingThreshold()
                    && !mergeOverlapsFiles(leftSubmission, rightSubmission, upperNeighbor, tokensBetweenLeft, tokensBetweenRight)) {
                globalMatches.remove(upperNeighbor);
                globalMatches.remove(lowerNeighbor);
                // The merged match starts where the upper match starts and is as long as both matches together; the
                // separating tokens are removed below, so the merged match is contiguous in both submissions.
                globalMatches.add(new Match(upperNeighbor.startOfFirst(), upperNeighbor.startOfSecond(), lengthUpper + lengthLower));
                globalMatches = removeToken(globalMatches, leftSubmission, rightSubmission, upperNeighbor, tokensBetweenLeft,
                        tokensBetweenRight);
                // Positions changed, so the neighbors are recomputed and the scan restarts from the beginning.
                neighbors = computeNeighbors(globalMatches);
                i = 0;
            } else {
                i++;
            }
        }
        return globalMatches;
    }

    /**
     * This function checks if a merge would go over file boundaries.
     * @param leftSubmission is the left submission
     * @param rightSubmission is the right submission
     * @param upperNeighbor is the upper neighboring match
     * @param tokensBetweenLeft amount of tokens that separate the neighboring matches in the left submission and need
     * to be removed
     * @param tokensBetweenRight amount of tokens that separate the neighboring matches in the right submission and need
     * to be removed
     * @return true if the merge goes over file boundaries.
     */
    private boolean mergeOverlapsFiles(Submission leftSubmission, Submission rightSubmission, Match upperNeighbor, int tokensBetweenLeft,
            int tokensBetweenRight) {
        // With a single file on both sides there is no file boundary a merge could cross.
        if (leftSubmission.getFiles().size() == 1 && rightSubmission.getFiles().size() == 1) {
            return false;
        }
        int startLeft = upperNeighbor.startOfFirst();
        int startRight = upperNeighbor.startOfSecond();
        int lengthUpper = upperNeighbor.length();

        // Only the tokens located between the two neighboring matches are inspected.
        List<Token> tokenLeft = new ArrayList<>(leftSubmission.getTokenList());
        List<Token> tokenRight = new ArrayList<>(rightSubmission.getTokenList());
        tokenLeft = tokenLeft.subList(startLeft + lengthUpper, startLeft + lengthUpper + tokensBetweenLeft);
        tokenRight = tokenRight.subList(startRight + lengthUpper, startRight + lengthUpper + tokensBetweenRight);

        return containsFileEndToken(tokenLeft) || containsFileEndToken(tokenRight);
    }

    /**
     * This function checks whether a list of tokens contains FILE_END.
     * @param token is the list of tokens
     * @return true if FILE_END is in token
     */
    private boolean containsFileEndToken(List<Token> token) {
        return token.stream().map(Token::getType).anyMatch(it -> it.equals(SharedTokenType.FILE_END));
    }

    /**
     * This function removes tokens from both submissions after a merge has been performed. Additionally it moves the
     * starting positions of matches that occur after the merged neighboring matches by the amount of removed tokens.
     * @param globalMatches is the list of matches after the merge; it is not modified
     * @param leftSubmission is the left submission
     * @param rightSubmission is the right submission
     * @param upperNeighbor is the upper neighboring match
     * @param tokensBetweenLeft amount of tokens that separate the neighboring matches in the left submission and need
     * to be removed
     * @param tokensBetweenRight amount of tokens that separate the neighboring matches in the right submission and need
     * to be removed
     * @return shiftedMatches with the mentioned changes.
     */
    private List<Match> removeToken(List<Match> globalMatches, Submission leftSubmission, Submission rightSubmission, Match upperNeighbor,
            int tokensBetweenLeft, int tokensBetweenRight) {
        int startLeft = upperNeighbor.startOfFirst();
        int startRight = upperNeighbor.startOfSecond();
        int lengthUpper = upperNeighbor.length();

        // Work on copies of the token lists and hand the shortened lists back to the submissions.
        List<Token> tokenLeft = new ArrayList<>(leftSubmission.getTokenList());
        List<Token> tokenRight = new ArrayList<>(rightSubmission.getTokenList());
        tokenLeft.subList(startLeft + lengthUpper, startLeft + lengthUpper + tokensBetweenLeft).clear();
        tokenRight.subList(startRight + lengthUpper, startRight + lengthUpper + tokensBetweenRight).clear();
        leftSubmission.setTokenList(tokenLeft);
        rightSubmission.setTokenList(tokenRight);

        List<Match> shiftedMatches = new ArrayList<>(globalMatches.size());
        for (Match match : globalMatches) {
            // Only matches that start behind the upper match are affected by the removal on the respective side.
            int leftShift = match.startOfFirst() > startLeft ? tokensBetweenLeft : 0;
            int rightShift = match.startOfSecond() > startRight ? tokensBetweenRight : 0;
            shiftedMatches.add(new Match(match.startOfFirst() - leftShift, match.startOfSecond() - rightShift, match.length()));
        }

        return shiftedMatches;
    }

    /**
     * This method marks the end of the merging pipeline and removes the remaining matches that are shorter than the
     * minimum token match.
     * @param globalMatches is the mutable list of matches, which is filtered in place
     * @return the same list without the too short matches
     */
    private List<Match> removeTooShortMatches(List<Match> globalMatches) {
        globalMatches.removeIf(match -> match.length() < options.minimumTokenMatch());
        return globalMatches;
    }
}
Loading

0 comments on commit 2d85029

Please sign in to comment.