From 40b2d499faf7a4ccc2c1625a3c847441161dd0f9 Mon Sep 17 00:00:00 2001
From: dev-mlb <19797865+dev-mlb@users.noreply.github.com>
Date: Thu, 14 Nov 2024 21:53:28 -0500
Subject: [PATCH] byteutil :: add support to check for non-indexable chars
(#1010)
---
src/main/java/emissary/util/ByteUtil.java | 66 +++++++++++++++++++
src/test/java/emissary/util/ByteUtilTest.java | 50 ++++++++++++++
2 files changed, 116 insertions(+)
diff --git a/src/main/java/emissary/util/ByteUtil.java b/src/main/java/emissary/util/ByteUtil.java
index a5de081eac..f7be2ee248 100755
--- a/src/main/java/emissary/util/ByteUtil.java
+++ b/src/main/java/emissary/util/ByteUtil.java
@@ -1,5 +1,11 @@
package emissary.util;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
@@ -333,6 +339,66 @@ public static String sha256Bytes(final byte[] bytes) {
}
}
+ /**
+ * Check whether the given bytes, decoded as UTF-8, contain any non-indexable characters
+ *
+ * @param utf8Bytes the bytes to be scanned
+ * @return whether there were non-indexable characters
+ */
+ public static boolean containsNonIndexableBytes(final byte[] utf8Bytes) {
+ // Wrap the byte array in a ByteArrayInputStream so the stream-based overload can perform the scan
+ final InputStream inputStream = new ByteArrayInputStream(utf8Bytes);
+ return containsNonIndexableBytes(inputStream);
+ }
+
+ /**
+ * Check whether the input stream, decoded as UTF-8, contains any non-indexable characters
+ *
+ * @param inputStream the input stream to be scanned; it is closed (via the wrapping reader) when this method returns
+ * @return whether there were non-indexable characters; also true when reading fails with an IOException
+ */
+ public static boolean containsNonIndexableBytes(final InputStream inputStream) {
+ // Create an InputStreamReader to read the bytes as characters; try-with-resources closes it and the wrapped stream
+ try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
+ int codePoint;
+ // NOTE: Reader.read() returns one UTF-16 code unit per call, not a full code point, so a supplementary character
+ // arrives as two surrogate values. Every range tested by isNotIndexable lies in the BMP, so the result is unaffected.
+ while ((codePoint = reader.read()) != -1) {
+ // Stop at the first value that is not indexable
+ if (isNotIndexable(codePoint)) {
+ return true;
+ }
+ }
+ return false;
+ } catch (IOException e) {
+ return true; // a read failure is reported the same way as a non-indexable character
+ }
+ }
+
+ /**
+ * Check if the value is a C0/C1 control character (other than U+0009-U+000D), DEL, or one of the selected
+ * space, format, and special characters below. Surrogates are not tested. Ranges, grouped by Unicode block:
+ * U0000 (Basic Latin / Latin-1 Supplement controls: U+0000-U+0008, U+000E-U+001F, U+007F-U+009F)
+ * U2000 (General Punctuation: U+2000-U+200F, U+2028-U+202F, U+205F-U+206F)
+ * U3000 (CJK Symbols and Punctuation: U+3000 only)
+ * UFE70 (Arabic Presentation Forms-B: U+FEFF only)
+ * UFFF0 (Specials: U+FFFD only)
+ *
+ * @param codepoint numerical value that maps to a specific character to check
+ * @return true if the value falls in one of the non-indexable ranges above, false otherwise
+ */
+ private static boolean isNotIndexable(final int codepoint) {
+ return ('\u0000' <= codepoint && codepoint <= '\u0008')
+ || ('\u000E' <= codepoint && codepoint <= '\u001F')
+ || ('\u007F' <= codepoint && codepoint <= '\u009F')
+ || ('\u2000' <= codepoint && codepoint <= '\u200F')
+ || ('\u2028' <= codepoint && codepoint <= '\u202F')
+ || ('\u205F' <= codepoint && codepoint <= '\u206F')
+ || codepoint == '\u3000'
+ || codepoint == '\uFEFF'
+ || codepoint == '\uFFFD';
+ }
+
/** This class is not meant to be instantiated. */
private ByteUtil() {}
}
diff --git a/src/test/java/emissary/util/ByteUtilTest.java b/src/test/java/emissary/util/ByteUtilTest.java
index 2f421d41dd..50c3114a04 100755
--- a/src/test/java/emissary/util/ByteUtilTest.java
+++ b/src/test/java/emissary/util/ByteUtilTest.java
@@ -4,6 +4,7 @@
import org.junit.jupiter.api.Test;
+import java.nio.charset.StandardCharsets;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -208,4 +209,53 @@ void testGrabLine() {
assertEquals("This is line three", ByteUtil.grabLine(data, 35), "Last line extraction");
}
+ @Test
+ void testContainsNonIndexableValues() {
+ String newLineCarriageTab = "This is line one\r\nThis is line two\nThis is line three\n\nEnding with a tab\t";
+ assertFalse(ByteUtil.hasNonPrintableValues(newLineCarriageTab.getBytes(StandardCharsets.UTF_8)));
+ assertFalse(ByteUtil.containsNonIndexableBytes(newLineCarriageTab.getBytes(StandardCharsets.UTF_8)));
+
+ // 3-byte character in UTF-8 (E2 82 AC): € (Euro symbol)
+ String euro = "€";
+ assertEquals("\u20ac", euro);
+ assertTrue(ByteUtil.hasNonPrintableValues(euro.getBytes(StandardCharsets.UTF_8)));
+ assertFalse(ByteUtil.containsNonIndexableBytes(euro.getBytes(StandardCharsets.UTF_8)));
+
+ // 3-byte characters: 你好 (Chinese for "hello")
+ String nihao = "你好";
+ assertEquals("\u4f60\u597d", nihao);
+ assertTrue(ByteUtil.hasNonPrintableValues(nihao.getBytes(StandardCharsets.UTF_8)));
+ assertFalse(ByteUtil.containsNonIndexableBytes(nihao.getBytes(StandardCharsets.UTF_8)));
+
+ // 4-byte character: 😊 (Emoji U+1F60A, smiling face with smiling eyes; a surrogate pair in UTF-16)
+ String emoji = "😊";
+ assertEquals("\uD83D\uDE0A", emoji);
+ assertTrue(ByteUtil.hasNonPrintableValues(emoji.getBytes(StandardCharsets.UTF_8)));
+ assertFalse(ByteUtil.containsNonIndexableBytes(emoji.getBytes(StandardCharsets.UTF_8)));
+
+ // Unicode value denoting 'null'; written as an escape because a raw NUL is invisible and easily corrupted
+ String uNull = "\u0000";
+ assertEquals("\u0000", uNull);
+ assertTrue(ByteUtil.hasNonPrintableValues(uNull.getBytes(StandardCharsets.UTF_8)));
+ assertTrue(ByteUtil.containsNonIndexableBytes(uNull.getBytes(StandardCharsets.UTF_8)));
+
+ // Narrow No-Break Space; written as an escape because it is indistinguishable from a plain space in source
+ String nnbsp = "\u202F";
+ assertEquals("\u202F", nnbsp);
+ assertTrue(ByteUtil.hasNonPrintableValues(nnbsp.getBytes(StandardCharsets.UTF_8)));
+ assertTrue(ByteUtil.containsNonIndexableBytes(nnbsp.getBytes(StandardCharsets.UTF_8)));
+
+ // byte order mark (zero width no-break space); written as an escape because tools commonly strip a raw BOM
+ String zwbsp = "\uFEFF";
+ assertEquals("\uFEFF", zwbsp);
+ assertTrue(ByteUtil.hasNonPrintableValues(zwbsp.getBytes(StandardCharsets.UTF_8)));
+ assertTrue(ByteUtil.containsNonIndexableBytes(zwbsp.getBytes(StandardCharsets.UTF_8)));
+
+ // UTF-8 Error Replacement Character; written as an escape so it cannot be confused with real decoding damage
+ String rep = "\uFFFD";
+ assertEquals("\uFFFD", rep);
+ assertTrue(ByteUtil.hasNonPrintableValues(rep.getBytes(StandardCharsets.UTF_8)));
+ assertTrue(ByteUtil.containsNonIndexableBytes(rep.getBytes(StandardCharsets.UTF_8)));
+ }
+
}