Replace all regexes in TextFormat.Tokenizer with direct char scanning.

The JVM regex engine allocates garbage on every match (especially when calling Matcher.usePattern!). Since there are expected to be a lot of tokens, this caused substantial GC overhead. Direct char scanning also opens the possibility of other optimizations that aren't possible with regexes. For example: - direct reads from a char[] - streaming tokenization (rather than reading the complete source text) PiperOrigin-RevId: 687635759
protocolbuffers · Oct 24, 2024 · 739b531 · 739b531
1 parent 47613cf
commit 739b531
Show file tree

Hide file tree

Showing 2 changed files with 176 additions and 65 deletions.
diff --git a/java/core/src/main/java/com/google/protobuf/TextFormat.java b/java/core/src/main/java/com/google/protobuf/TextFormat.java
@@ -24,7 +24,6 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.logging.Logger;
-import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 /**
@@ -991,14 +990,9 @@ public void eol() throws IOException {
    *       Scanner} provides no way to inspect the contents of delimiters, making it impossible to
    *       keep track of line and column numbers.
    * </ul>
-   *
-   * <p>Luckily, Java's regular expression support does manage to be useful to us. (Barely: We need
-   * {@code Matcher.usePattern()}, which is new in Java 1.5.) So, we can use that, at least.
-   * Unfortunately, this implies that we need to have the entire input in one contiguous string.
    */
   private static final class Tokenizer {
     private final CharSequence text;
-    private final Matcher matcher;
     private String currentToken;
 
     // The character index within this.text at which the current token begins.
@@ -1007,29 +1001,13 @@ private static final class Tokenizer {
     // The line and column numbers of the current token.
     private int line = 0;
     private int column = 0;
+    private int lineInfoTrackingPos = 0;
 
     // The line and column numbers of the previous token (allows throwing
     // errors *after* consuming).
     private int previousLine = 0;
     private int previousColumn = 0;
 
-    // We use possessive quantifiers (*+ and ++) because otherwise the Java
-    // regex matcher has stack overflows on large inputs.
-    private static final Pattern WHITESPACE = Pattern.compile("(\\s|(#.*$))++", Pattern.MULTILINE);
-    private static final Pattern TOKEN =
-        Pattern.compile(
-            "[a-zA-Z_][0-9a-zA-Z_+-]*+|" // an identifier
-                + "[.]?[0-9+-][0-9a-zA-Z_.+-]*+|" // a number
-                + "\"([^\"\n\\\\]|\\\\.)*+(\"|\\\\?$)|" // a double-quoted string
-                + "\'([^\'\n\\\\]|\\\\.)*+(\'|\\\\?$)", // a single-quoted string
-            Pattern.MULTILINE);
-
-    private static final Pattern DOUBLE_INFINITY =
-        Pattern.compile("-?inf(inity)?", Pattern.CASE_INSENSITIVE);
-    private static final Pattern FLOAT_INFINITY =
-        Pattern.compile("-?inf(inity)?f?", Pattern.CASE_INSENSITIVE);
-    private static final Pattern FLOAT_NAN = Pattern.compile("nanf?", Pattern.CASE_INSENSITIVE);
-
     /**
      * {@link containsSilentMarkerAfterCurrentToken} indicates if there is a silent marker after the
      * current token. This value is moved to {@link containsSilentMarkerAfterPrevToken} every time
@@ -1042,7 +1020,6 @@ private static final class Tokenizer {
     /** Construct a tokenizer that parses tokens from the given text. */
     private Tokenizer(final CharSequence text) {
       this.text = text;
-      this.matcher = WHITESPACE.matcher(text);
       skipWhitespace();
       nextToken();
     }
@@ -1082,41 +1059,156 @@ void nextToken() {
       previousColumn = column;
 
       // Advance the line counter to the current position.
-      while (pos < matcher.regionStart()) {
-        if (text.charAt(pos) == '\n') {
+      while (lineInfoTrackingPos < pos) {
+        if (text.charAt(lineInfoTrackingPos) == '\n') {
           ++line;
           column = 0;
         } else {
           ++column;
         }
-        ++pos;
+        ++lineInfoTrackingPos;
       }
 
       // Match the next token.
-      if (matcher.regionStart() == matcher.regionEnd()) {
-        // EOF
-        currentToken = "";
+      if (pos == text.length()) {
+        currentToken = ""; // EOF
       } else {
-        matcher.usePattern(TOKEN);
-        if (matcher.lookingAt()) {
-          currentToken = matcher.group();
-          matcher.region(matcher.end(), matcher.regionEnd());
-        } else {
-          // Take one character.
-          currentToken = String.valueOf(text.charAt(pos));
-          matcher.region(pos + 1, matcher.regionEnd());
+        currentToken = nextTokenInternal();
+        skipWhitespace();
+      }
+    }
+
+    private String nextTokenInternal() {
+      final int textLength = this.text.length();
+      final int startPos = this.pos;
+      final char startChar = this.text.charAt(startPos);
+
+      int endPos = pos;
+      if (isAlphaUnder(startChar)) { // Identifier
+        while (++endPos != textLength) {
+          char c = this.text.charAt(endPos);
+          if (!(isAlphaUnder(c) || isDigitPlusMinus(c))) {
+            break;
+          }
         }
+      } else if (isDigitPlusMinus(startChar) || startChar == '.') { // Number
+        if (startChar == '.') { // Optional leading dot
+          if (++endPos == textLength) {
+            return nextTokenSingleChar();
+          }
 
-        skipWhitespace();
+          if (!isDigitPlusMinus(this.text.charAt(endPos))) { // Mandatory first digit
+            return nextTokenSingleChar();
+          }
+        }
+
+        while (++endPos != textLength) {
+          char c = this.text.charAt(endPos);
+          if (!(isDigitPlusMinus(c) || isAlphaUnder(c) || c == '.')) {
+            break;
+          }
+        }
+      } else if (startChar == '"' || startChar == '\'') { // String
+        while (++endPos != textLength) {
+          char c = this.text.charAt(endPos);
+          if (c == startChar) {
+            ++endPos;
+            break; // Quote terminates
+          } else if (c == '\n') {
+            break; // Newline terminates (error during parsing) (not consumed)
+          } else if (c == '\\') {
+            if (++endPos == textLength) {
+              break; // Escape into end-of-text terminates (error during parsing)
+            } else if (this.text.charAt(endPos) == '\n') {
+              break; // Escape into newline terminates (error during parsing) (not consumed)
+            } else {
+              // Otherwise the escaped char is legal and consumed
+            }
+          } else {
+            // Otherwise the char is a legal and consumed
+          }
+        }
+      } else {
+        return nextTokenSingleChar(); // Unrecognized start character
+      }
+
+      this.pos = endPos;
+      return this.text.subSequence(startPos, endPos).toString();
+    }
+
+    private static boolean isAlphaUnder(char c) {
+      // Defining this char-class with numeric comparisons is much faster than using a regex.
+      return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_';
+    }
+
+    private static boolean isDigitPlusMinus(char c) {
+      // Defining this char-class with numeric comparisons is much faster than using a regex.
+      return ('0' <= c && c <= '9') || c == '+' || c == '-';
+    }
+
+    private static boolean isWhitespace(char c) {
+      // Defining this char-class with numeric comparisons is much faster than using a regex.
+      return c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t';
+    }
+
+    /**
+     * Produce a token for the single char at the current position.
+     *
+     * <p>We hardcode the expected single-char tokens to avoid allocating a unique string every
+     * time, which is a GC risk. String-literals are always loaded from the class constant pool.
+     *
+     * <p>This method must not be called if the current position is after the end-of-text.
+     */
+    private String nextTokenSingleChar() {
+      final char c = this.text.charAt(this.pos++);
+      switch (c) {
+        case ':':
+          return ":";
+        case ',':
+          return ",";
+        case '[':
+          return "[";
+        case ']':
+          return "]";
+        case '{':
+          return "{";
+        case '}':
+          return "}";
+        case '<':
+          return "<";
+        case '>':
+          return ">";
+        default:
+          // If we don't recognize the char, create a string and let the parser report any errors
+          return String.valueOf(c);
       }
     }
 
     /** Skip over any whitespace so that the matcher region starts at the next token. */
     private void skipWhitespace() {
-      matcher.usePattern(WHITESPACE);
-      if (matcher.lookingAt()) {
-        matcher.region(matcher.end(), matcher.regionEnd());
+      final int textLength = this.text.length();
+      final int startPos = this.pos;
+
+      int endPos = this.pos - 1;
+      while (++endPos != textLength) {
+        char c = this.text.charAt(endPos);
+        if (c == '#') {
+          while (++endPos != textLength) {
+            if (this.text.charAt(endPos) == '\n') {
+              break; // Consume the newline as whitespace.
+            }
+          }
+          if (endPos == textLength) {
+            break;
+          }
+        } else if (isWhitespace(c)) {
+          // OK
+        } else {
+          break;
+        }
       }
+
+      this.pos = endPos;
     }
 
     /**
@@ -1148,8 +1240,7 @@ boolean lookingAtInteger() {
         return false;
       }
 
-      final char c = currentToken.charAt(0);
-      return ('0' <= c && c <= '9') || c == '-' || c == '+';
+      return isDigitPlusMinus(currentToken.charAt(0));
     }
 
     /** Returns {@code true} if the current token's text is equal to that specified. */
@@ -1164,11 +1255,7 @@ boolean lookingAt(String text) {
     String consumeIdentifier() throws ParseException {
       for (int i = 0; i < currentToken.length(); i++) {
         final char c = currentToken.charAt(i);
-        if (('a' <= c && c <= 'z')
-            || ('A' <= c && c <= 'Z')
-            || ('0' <= c && c <= '9')
-            || (c == '_')
-            || (c == '.')) {
+        if (isAlphaUnder(c) || ('0' <= c && c <= '9') || (c == '.')) {
           // OK
         } else {
           throw parseException("Expected identifier. Found '" + currentToken + "'");
@@ -1282,15 +1369,22 @@ public boolean tryConsumeUInt64() {
     public double consumeDouble() throws ParseException {
       // We need to parse infinity and nan separately because
       // Double.parseDouble() does not accept "inf", "infinity", or "nan".
-      if (DOUBLE_INFINITY.matcher(currentToken).matches()) {
-        final boolean negative = currentToken.startsWith("-");
-        nextToken();
-        return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
-      }
-      if (currentToken.equalsIgnoreCase("nan")) {
-        nextToken();
-        return Double.NaN;
+      switch (currentToken.toLowerCase(Locale.ROOT)) {
+        case "-inf":
+        case "-infinity":
+          nextToken();
+          return Double.NEGATIVE_INFINITY;
+        case "inf":
+        case "infinity":
+          nextToken();
+          return Double.POSITIVE_INFINITY;
+        case "nan":
+          nextToken();
+          return Double.NaN;
+        default:
+          // fall through
       }
+
       try {
         final double result = Double.parseDouble(currentToken);
         nextToken();
@@ -1320,15 +1414,27 @@ public boolean tryConsumeDouble() {
     public float consumeFloat() throws ParseException {
       // We need to parse infinity and nan separately because
       // Float.parseFloat() does not accept "inf", "infinity", or "nan".
-      if (FLOAT_INFINITY.matcher(currentToken).matches()) {
-        final boolean negative = currentToken.startsWith("-");
-        nextToken();
-        return negative ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
-      }
-      if (FLOAT_NAN.matcher(currentToken).matches()) {
-        nextToken();
-        return Float.NaN;
+      switch (currentToken.toLowerCase(Locale.ROOT)) {
+        case "-inf":
+        case "-inff":
+        case "-infinity":
+        case "-infinityf":
+          nextToken();
+          return Float.NEGATIVE_INFINITY;
+        case "inf":
+        case "inff":
+        case "infinity":
+        case "infinityf":
+          nextToken();
+          return Float.POSITIVE_INFINITY;
+        case "nan":
+        case "nanf":
+          nextToken();
+          return Float.NaN;
+        default:
+          // fall through
       }
+
       try {
         final float result = Float.parseFloat(currentToken);
         nextToken();

diff --git a/java/core/src/test/java/com/google/protobuf/TextFormatTest.java b/java/core/src/test/java/com/google/protobuf/TextFormatTest.java
@@ -81,6 +81,7 @@ public class TextFormatTest {
           + "repeated_double: 0.125\n"
           + "repeated_double: .125\n"
           + "repeated_double: -.125\n"
+          + "repeated_double: .0\n"
           + "repeated_double: 1.23E17\n"
           + "repeated_double: 1.23E+17\n"
           + "repeated_double: -1.23e-17\n"
@@ -314,6 +315,7 @@ public void testPrintExotic() throws Exception {
             .addRepeatedDouble(0.125)
             .addRepeatedDouble(.125)
             .addRepeatedDouble(-.125)
+            .addRepeatedDouble(.0)
             .addRepeatedDouble(123e15)
             .addRepeatedDouble(123e15)
             .addRepeatedDouble(-1.23e-17)
@@ -949,6 +951,9 @@ public void testParseErrors() throws Exception {
         "1:23: Enum type \"protobuf_unittest.TestAllTypes.NestedEnum\" has no "
             + "value with number 123.",
         "optional_nested_enum: 123");
+    assertParseError("1:18: Couldn't parse number: For input string: \".\"", "repeated_double: .");
+    assertParseError(
+        "1:18: Couldn't parse number: For input string: \".+\"", "repeated_double: .+");
 
     // Delimiters must match.
     assertParseError("1:22: Expected identifier. Found '}'", "OptionalGroup < a: 1 }");