Skip to content

Commit

Permalink
byteutil :: add support to check for non-indexable chars (#1010)
Browse files Browse the repository at this point in the history
  • Loading branch information
dev-mlb authored Nov 15, 2024
1 parent bb507fa commit 40b2d49
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 0 deletions.
66 changes: 66 additions & 0 deletions src/main/java/emissary/util/ByteUtil.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
package emissary.util;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
Expand Down Expand Up @@ -333,6 +339,66 @@ public static String sha256Bytes(final byte[] bytes) {
}
}

/**
 * Check whether a byte array, interpreted as UTF-8 text, contains any non-indexable characters.
 * <p>
 * This is a convenience overload: the array is exposed as an in-memory stream and the scan itself is done by
 * {@code containsNonIndexableBytes(InputStream)}.
 *
 * @param utf8Bytes the bytes to be scanned
 * @return whether there were non-indexable characters
 */
public static boolean containsNonIndexableBytes(final byte[] utf8Bytes) {
    // The argument's static type is a stream, so this resolves to the InputStream overload.
    return containsNonIndexableBytes(new ByteArrayInputStream(utf8Bytes));
}

/**
 * Check if the input stream contains any non-indexable characters.
 * <p>
 * The stream is decoded as UTF-8 and every decoded UTF-16 code unit is tested with {@code isNotIndexable}. The
 * stream is consumed and closed by this method.
 *
 * @param inputStream the input stream to be scanned; it is closed before this method returns
 * @return {@code true} if a non-indexable character was found or the stream could not be read, {@code false}
 *         otherwise
 */
public static boolean containsNonIndexableBytes(final InputStream inputStream) {
    try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
        // Decode in chunks instead of polling the unbuffered reader one character at a time.
        final char[] chunk = new char[4096];
        int filled;
        while ((filled = reader.read(chunk, 0, chunk.length)) != -1) {
            for (int i = 0; i < filled; i++) {
                // A Reader hands back UTF-16 code units, not code points: a supplementary character arrives as
                // two surrogates. Checking each unit on its own is sufficient because every non-indexable range
                // lies inside the Basic Multilingual Plane and none of them overlaps the surrogate block.
                if (isNotIndexable(chunk[i])) {
                    return true;
                }
            }
        }
        return false;
    } catch (IOException e) {
        // An unreadable stream is reported as non-indexable rather than propagating the failure.
        return true;
    }
}

/**
 * Check if a character value falls in one of the ranges treated as non-indexable: C0 and C1 control characters
 * (tab, line feed, vertical tab, form feed and carriage return are accepted), the special spaces, invisible
 * format characters and directional controls of the General Punctuation block, the ideographic space, the byte
 * order mark and the replacement character. Surrogates and supplementary characters are never reported.
 * <a href="https://en.wikipedia.org/wiki/Unicode_block">Unicode Block</a>
 * <a href="https://www.unicode.org/charts/PDF/U0000.pdf">U0000</a>
 * <a href="https://www.unicode.org/charts/PDF/U2000.pdf">U2000</a>
 * <a href="https://www.unicode.org/charts/PDF/U3000.pdf">U3000</a>
 * <a href="https://www.unicode.org/charts/PDF/UFE70.pdf">UFE70</a>
 * <a href="https://www.unicode.org/charts/PDF/UFFF0.pdf">UFFF0</a>
 *
 * @param codepoint numerical value of the character to check
 * @return {@code true} if the value is in a non-indexable range, {@code false} if it is acceptable text
 */
private static boolean isNotIndexable(final int codepoint) {
    return ('\u0000' <= codepoint && codepoint <= '\u0008') // C0 controls below TAB
            || ('\u000E' <= codepoint && codepoint <= '\u001F') // C0 controls above CR
            || ('\u007F' <= codepoint && codepoint <= '\u009F') // DEL and the C1 controls
            || ('\u2000' <= codepoint && codepoint <= '\u200F') // typographic/zero-width spaces, joiners, marks
            || ('\u2028' <= codepoint && codepoint <= '\u202F') // line/paragraph separators, bidi controls, NNBSP
            || ('\u205F' <= codepoint && codepoint <= '\u206F') // math space, invisible operators, bidi isolates
            || codepoint == '\u3000' // ideographic space
            || codepoint == '\uFEFF' // byte order mark / zero width no-break space
            || codepoint == '\uFFFD'; // replacement character
}

/** This class is not meant to be instantiated; the private constructor blocks construction from outside. */
private ByteUtil() {}
}
50 changes: 50 additions & 0 deletions src/test/java/emissary/util/ByteUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import org.junit.jupiter.api.Test;

import java.nio.charset.StandardCharsets;
import java.util.List;

import static org.junit.jupiter.api.Assertions.assertEquals;
Expand Down Expand Up @@ -208,4 +209,53 @@ void testGrabLine() {
assertEquals("This is line three", ByteUtil.grabLine(data, 35), "Last line extraction");
}

/**
 * Exercises {@code ByteUtil.containsNonIndexableBytes(byte[])} next to {@code ByteUtil.hasNonPrintableValues}
 * on plain text, multi-byte UTF-8 characters and characters that must be flagged as non-indexable.
 * <p>
 * Invisible characters are written as Unicode escapes so that they cannot be lost or confused with ordinary
 * whitespace when the source file is edited, copied or re-encoded.
 */
@Test
void testContainsNonIndexableValues() {
    // Ordinary text with the whitespace controls that are accepted (CR, LF, TAB)
    String newLineCarriageTab = "This is line one\r\nThis is line two\nThis is line three\n\nEnding with a tab\t";
    assertFalse(ByteUtil.hasNonPrintableValues(newLineCarriageTab.getBytes(StandardCharsets.UTF_8)));
    assertFalse(ByteUtil.containsNonIndexableBytes(newLineCarriageTab.getBytes(StandardCharsets.UTF_8)));

    // 3-byte UTF-8 character: € (Euro sign, U+20AC)
    String euro = "€";
    assertEquals("\u20ac", euro);
    assertTrue(ByteUtil.hasNonPrintableValues(euro.getBytes(StandardCharsets.UTF_8)));
    assertFalse(ByteUtil.containsNonIndexableBytes(euro.getBytes(StandardCharsets.UTF_8)));

    // 3-byte UTF-8 characters: 你好 (Chinese for "hello")
    String nihao = "你好";
    assertEquals("\u4f60\u597d", nihao);
    assertTrue(ByteUtil.hasNonPrintableValues(nihao.getBytes(StandardCharsets.UTF_8)));
    assertFalse(ByteUtil.containsNonIndexableBytes(nihao.getBytes(StandardCharsets.UTF_8)));

    // 4-byte UTF-8 character: 😊 (emoji U+1F60A, a surrogate pair in UTF-16)
    String emoji = "😊";
    assertEquals("\uD83D\uDE0A", emoji);
    assertTrue(ByteUtil.hasNonPrintableValues(emoji.getBytes(StandardCharsets.UTF_8)));
    assertFalse(ByteUtil.containsNonIndexableBytes(emoji.getBytes(StandardCharsets.UTF_8)));

    // Unicode value denoting 'null' (U+0000)
    String uNull = "\u0000";
    assertEquals(1, uNull.length());
    assertTrue(ByteUtil.hasNonPrintableValues(uNull.getBytes(StandardCharsets.UTF_8)));
    assertTrue(ByteUtil.containsNonIndexableBytes(uNull.getBytes(StandardCharsets.UTF_8)));

    // Narrow No-Break Space (U+202F)
    String nnbsp = "\u202F";
    assertEquals(1, nnbsp.length());
    assertTrue(ByteUtil.hasNonPrintableValues(nnbsp.getBytes(StandardCharsets.UTF_8)));
    assertTrue(ByteUtil.containsNonIndexableBytes(nnbsp.getBytes(StandardCharsets.UTF_8)));

    // Byte order mark / zero width no-break space (U+FEFF)
    String zwbsp = "\uFEFF";
    assertEquals(1, zwbsp.length());
    assertTrue(ByteUtil.hasNonPrintableValues(zwbsp.getBytes(StandardCharsets.UTF_8)));
    assertTrue(ByteUtil.containsNonIndexableBytes(zwbsp.getBytes(StandardCharsets.UTF_8)));

    // UTF-8 error replacement character (U+FFFD)
    String rep = "\uFFFD";
    assertEquals(1, rep.length());
    assertTrue(ByteUtil.hasNonPrintableValues(rep.getBytes(StandardCharsets.UTF_8)));
    assertTrue(ByteUtil.containsNonIndexableBytes(rep.getBytes(StandardCharsets.UTF_8)));
}

}

0 comments on commit 40b2d49

Please sign in to comment.