diff --git a/src/main/java/emissary/util/ByteUtil.java b/src/main/java/emissary/util/ByteUtil.java
index a5de081eac..3f1309b55e 100755
--- a/src/main/java/emissary/util/ByteUtil.java
+++ b/src/main/java/emissary/util/ByteUtil.java
@@ -1,5 +1,13 @@
 package emissary.util;
 
+import org.apache.commons.lang3.StringUtils;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
@@ -333,6 +341,71 @@ public static String sha256Bytes(final byte[] bytes) {
         }
     }
 
+    /**
+     * Check whether the bytes, decoded as UTF-8, contain any non-indexable characters.
+     *
+     * @param utf8Bytes the UTF-8 encoded bytes to be scanned
+     * @return whether there were non-indexable characters
+     */
+    public static boolean containsNonIndexableBytes(final byte[] utf8Bytes) {
+        // Wrap the byte array in a ByteArrayInputStream
+        final InputStream inputStream = new ByteArrayInputStream(utf8Bytes);
+        return containsNonIndexableBytes(inputStream);
+    }
+
+    /**
+     * Check whether the input stream, decoded as UTF-8, contains any non-indexable characters. Reading stops at the
+     * first non-indexable character, and the stream is closed before this method returns.
+     *
+     * @param inputStream the input stream to be scanned; it is closed by this call
+     * @return {@code true} if a non-indexable character was found; {@code false} if none was found or the stream
+     *         could not be read
+     */
+    public static boolean containsNonIndexableBytes(final InputStream inputStream) {
+        // Create an InputStreamReader to read the bytes as characters
+        try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
+            int codeUnit;
+            // Reader.read() returns one UTF-16 code unit per call, so a supplementary character arrives as two
+            // surrogates. Every range tested by isNotIndexable lies in the BMP outside the surrogate block, so
+            // checking each code unit on its own gives the same answer as checking whole code points.
+            while ((codeUnit = reader.read()) != -1) {
+                // Check if the character is indexable
+                if (isNotIndexable(codeUnit)) {
+                    return true;
+                }
+            }
+            return false;
+        } catch (IOException ignored) {
+            // Best effort: an unreadable stream is reported as containing no non-indexable characters
+            return false;
+        }
+    }
+
+    /**
+     * Check if the character must not be indexed. The rejected characters are:
+     * <ul>
+     * <li>U+0000-U+0008, U+000E-U+001F, U+007F-U+009F: control characters other than tab, LF, VT, FF and CR</li>
+     * <li>U+2000-U+200F, U+2028-U+202F, U+205F-U+206F: spaces, separators and invisible format characters</li>
+     * <li>U+3000: ideographic space</li>
+     * <li>U+FEFF: byte order mark</li>
+     * <li>U+FFFD: replacement character</li>
+     * </ul>
+     *
+     * @param codepoint numerical value that maps to a specific character to check
+     * @return {@code true} if the character is non-indexable, {@code false} if it is a valid text character
+     */
+    private static boolean isNotIndexable(final int codepoint) {
+        return ('\u0000' <= codepoint && codepoint <= '\u0008')
+                || ('\u000E' <= codepoint && codepoint <= '\u001F')
+                || ('\u007F' <= codepoint && codepoint <= '\u009F')
+                || ('\u2000' <= codepoint && codepoint <= '\u200F')
+                || ('\u2028' <= codepoint && codepoint <= '\u202F')
+                || ('\u205F' <= codepoint && codepoint <= '\u206F')
+                || codepoint == '\u3000'
+                || codepoint == '\uFEFF'
+                || codepoint == '\uFFFD';
+    }
+
     /** This class is not meant to be instantiated. */
     private ByteUtil() {}
 }
diff --git a/src/test/java/emissary/util/ByteUtilTest.java b/src/test/java/emissary/util/ByteUtilTest.java
index 2f421d41dd..50c3114a04 100755
--- a/src/test/java/emissary/util/ByteUtilTest.java
+++ b/src/test/java/emissary/util/ByteUtilTest.java
@@ -4,6 +4,7 @@
 
 import org.junit.jupiter.api.Test;
 
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -208,4 +209,51 @@ void testGrabLine() {
         assertEquals("This is line three", ByteUtil.grabLine(data, 35), "Last line extraction");
     }
 
+    @Test
+    void testContainsNonIndexableValues() {
+        String newLineCarriageTab = "This is line one\r\nThis is line two\nThis is line three\n\nEnding with a tab\t";
+        assertFalse(ByteUtil.hasNonPrintableValues(newLineCarriageTab.getBytes(StandardCharsets.UTF_8)));
+        assertFalse(ByteUtil.containsNonIndexableBytes(newLineCarriageTab.getBytes(StandardCharsets.UTF_8)));
+
+        // 3-byte character: € (Euro symbol, U+20AC encodes as E2 82 AC)
+        String euro = "€";
+        assertEquals("\u20ac", euro);
+        assertTrue(ByteUtil.hasNonPrintableValues(euro.getBytes(StandardCharsets.UTF_8)));
+        assertFalse(ByteUtil.containsNonIndexableBytes(euro.getBytes(StandardCharsets.UTF_8)));
+
+        // 3-byte characters: 你好 (Chinese for "hello")
+        String nihao = "你好";
+        assertEquals("\u4f60\u597d", nihao);
+        assertTrue(ByteUtil.hasNonPrintableValues(nihao.getBytes(StandardCharsets.UTF_8)));
+        assertFalse(ByteUtil.containsNonIndexableBytes(nihao.getBytes(StandardCharsets.UTF_8)));
+
+        // 4-byte character: 😊 (Emoji: smiling face with smiling eyes, U+1F60A)
+        String emoji = "😊";
+        assertEquals("\uD83D\uDE0A", emoji);
+        assertTrue(ByteUtil.hasNonPrintableValues(emoji.getBytes(StandardCharsets.UTF_8)));
+        assertFalse(ByteUtil.containsNonIndexableBytes(emoji.getBytes(StandardCharsets.UTF_8)));
+
+        // Invisible characters below are spelled as Unicode escapes so editors/formatters cannot silently drop them
+        // Unicode value denoting 'null'
+        String uNull = "\u0000";
+        assertTrue(ByteUtil.hasNonPrintableValues(uNull.getBytes(StandardCharsets.UTF_8)));
+        assertTrue(ByteUtil.containsNonIndexableBytes(uNull.getBytes(StandardCharsets.UTF_8)));
+
+        // Narrow No-Break Space
+        String nnbsp = "\u202F";
+        assertTrue(ByteUtil.hasNonPrintableValues(nnbsp.getBytes(StandardCharsets.UTF_8)));
+        assertTrue(ByteUtil.containsNonIndexableBytes(nnbsp.getBytes(StandardCharsets.UTF_8)));
+
+        // byte order mark
+        String zwbsp = "\uFEFF";
+        assertTrue(ByteUtil.hasNonPrintableValues(zwbsp.getBytes(StandardCharsets.UTF_8)));
+        assertTrue(ByteUtil.containsNonIndexableBytes(zwbsp.getBytes(StandardCharsets.UTF_8)));
+
+        // UTF-8 Error Replacement Character
+        String rep = "�";
+        assertEquals("\uFFFD", rep);
+        assertTrue(ByteUtil.hasNonPrintableValues(rep.getBytes(StandardCharsets.UTF_8)));
+        assertTrue(ByteUtil.containsNonIndexableBytes(rep.getBytes(StandardCharsets.UTF_8)));
+    }
+
 }