byteutil :: add support to check for non-indexable chars (#1010)

NationalSecurityAgency · Nov 15, 2024 · 40b2d49 · 40b2d49
1 parent bb507fa
commit 40b2d49
Show file tree

Hide file tree

Showing 2 changed files with 116 additions and 0 deletions.
diff --git a/src/main/java/emissary/util/ByteUtil.java b/src/main/java/emissary/util/ByteUtil.java
@@ -1,5 +1,11 @@
 package emissary.util;
 
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
@@ -333,6 +339,66 @@ public static String sha256Bytes(final byte[] bytes) {
         }
     }
 
+    /**
+     * Scan a byte array, decoded as UTF-8, for non-indexable characters.
+     *
+     * @param utf8Bytes the bytes to be scanned
+     * @return true when the stream-based overload reports a non-indexable character
+     */
+    public static boolean containsNonIndexableBytes(final byte[] utf8Bytes) {
+        // The stream-based overload does the decoding and the per-character checks; an in-memory
+        // stream over the array is all it needs.
+        return containsNonIndexableBytes(new ByteArrayInputStream(utf8Bytes));
+    }
+
+    /**
+     * Check if the input stream, decoded as UTF-8, contains any non-indexable characters. The stream is closed
+     * before this method returns.
+     *
+     * @param inputStream the input stream to be scanned
+     * @return true if a non-indexable character was found or if reading the stream failed
+     */
+    public static boolean containsNonIndexableBytes(final InputStream inputStream) {
+        try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
+            // Reader.read() yields one UTF-16 code unit, not a code point: a supplementary character arrives as
+            // two surrogates. Every value isNotIndexable flags is a BMP non-surrogate, so per-unit checks suffice.
+            int codeUnit;
+            while ((codeUnit = reader.read()) != -1) {
+                if (isNotIndexable(codeUnit)) {
+                    return true;
+                }
+            }
+            return false;
+        } catch (IOException e) {
+            // A stream that cannot be read cannot be shown to be clean, so report it as non-indexable
+            return true;
+        }
+    }
+
+    /**
+     * Check if the value is a control, space/format, byte order mark or replacement character
+     * <a href="https://en.wikipedia.org/wiki/Unicode_block">Unicode Block</a>
+     * <a href="https://www.unicode.org/charts/PDF/U0000.pdf">U0000</a>
+     * <a href="https://www.unicode.org/charts/PDF/U2000.pdf">U2000</a>
+     * <a href="https://www.unicode.org/charts/PDF/U3000.pdf">U3000</a>
+     * <a href="https://www.unicode.org/charts/PDF/UFE70.pdf">UFE70</a>
+     * <a href="https://www.unicode.org/charts/PDF/UFFF0.pdf">UFFF0</a>
+     *
+     * @param codepoint numerical value that maps to a specific character to check
+     * @return true if the value is in one of the non-indexable ranges, false otherwise
+     */
+    private static boolean isNotIndexable(final int codepoint) {
+        return (0x0000 <= codepoint && codepoint <= 0x0008) // C0 controls before tab
+                || (0x000E <= codepoint && codepoint <= 0x001F) // C0 controls after carriage return
+                || (0x007F <= codepoint && codepoint <= 0x009F) // DEL and C1 controls
+                || (0x2000 <= codepoint && codepoint <= 0x200F) // typographic spaces, zero-width and direction marks
+                || (0x2028 <= codepoint && codepoint <= 0x202F) // line/paragraph separators, bidi controls, NNBSP
+                || (0x205F <= codepoint && codepoint <= 0x206F) // math space and invisible format characters
+                || codepoint == 0x3000 // ideographic space
+                || codepoint == 0xFEFF // byte order mark / zero width no-break space
+                || codepoint == 0xFFFD; // replacement character
+    }
+
     /** This class is not meant to be instantiated. */
     private ByteUtil() {}
 }
diff --git a/src/test/java/emissary/util/ByteUtilTest.java b/src/test/java/emissary/util/ByteUtilTest.java
@@ -4,6 +4,7 @@
 
 import org.junit.jupiter.api.Test;
 
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -208,4 +209,53 @@ void testGrabLine() {
         assertEquals("This is line three", ByteUtil.grabLine(data, 35), "Last line extraction");
     }
 
+    // Runs hasNonPrintableValues and containsNonIndexableBytes over the same UTF-8 samples to pin where they differ
+    @Test
+    void testContainsNonIndexableValues() {
+        String newLineCarriageTab = "This is line one\r\nThis is line two\nThis is line three\n\nEnding with a tab\t";
+        assertFalse(ByteUtil.hasNonPrintableValues(newLineCarriageTab.getBytes(StandardCharsets.UTF_8)));
+        assertFalse(ByteUtil.containsNonIndexableBytes(newLineCarriageTab.getBytes(StandardCharsets.UTF_8)));
+        // 3-byte character: € (Euro symbol, U+20AC)
+        String euro = "€";
+        assertEquals("\u20ac", euro);
+        assertTrue(ByteUtil.hasNonPrintableValues(euro.getBytes(StandardCharsets.UTF_8)));
+        assertFalse(ByteUtil.containsNonIndexableBytes(euro.getBytes(StandardCharsets.UTF_8)));
+
+        // Two 3-byte characters: 你好 (Chinese for "hello")
+        String nihao = "你好";
+        assertEquals("\u4f60\u597d", nihao);
+        assertTrue(ByteUtil.hasNonPrintableValues(nihao.getBytes(StandardCharsets.UTF_8)));
+        assertFalse(ByteUtil.containsNonIndexableBytes(nihao.getBytes(StandardCharsets.UTF_8)));
+
+        // 4-byte character: 😊 (Emoji: smiling face with smiling eyes, U+1F60A)
+        String emoji = "😊";
+        assertEquals("\uD83D\uDE0A", emoji);
+        assertTrue(ByteUtil.hasNonPrintableValues(emoji.getBytes(StandardCharsets.UTF_8)));
+        assertFalse(ByteUtil.containsNonIndexableBytes(emoji.getBytes(StandardCharsets.UTF_8)));
+
+        // Unicode value denoting 'null' (escaped: the raw character is invisible and easily lost in transit)
+        String uNull = "\u0000";
+        assertEquals("\u0000", uNull);
+        assertTrue(ByteUtil.hasNonPrintableValues(uNull.getBytes(StandardCharsets.UTF_8)));
+        assertTrue(ByteUtil.containsNonIndexableBytes(uNull.getBytes(StandardCharsets.UTF_8)));
+
+        // Narrow No-Break Space (escaped so it cannot be mistaken for a plain space)
+        String nnbsp = "\u202F";
+        assertEquals("\u202F", nnbsp);
+        assertTrue(ByteUtil.hasNonPrintableValues(nnbsp.getBytes(StandardCharsets.UTF_8)));
+        assertTrue(ByteUtil.containsNonIndexableBytes(nnbsp.getBytes(StandardCharsets.UTF_8)));
+
+        // byte order mark (escaped: U+FEFF is invisible in source)
+        String zwbsp = "\uFEFF";
+        assertEquals("\uFEFF", zwbsp);
+        assertTrue(ByteUtil.hasNonPrintableValues(zwbsp.getBytes(StandardCharsets.UTF_8)));
+        assertTrue(ByteUtil.containsNonIndexableBytes(zwbsp.getBytes(StandardCharsets.UTF_8)));
+
+        // UTF-8 Error Replacement Character
+        String rep = "�";
+        assertEquals("\uFFFD", rep);
+        assertTrue(ByteUtil.hasNonPrintableValues(rep.getBytes(StandardCharsets.UTF_8)));
+        assertTrue(ByteUtil.containsNonIndexableBytes(rep.getBytes(StandardCharsets.UTF_8)));
+    }
+
 }