From 739b531af528ae4450fa39851e63043cddddf378 Mon Sep 17 00:00:00 2001 From: nickreid Date: Sat, 19 Oct 2024 09:10:25 -0700 Subject: [PATCH] Replace all regexes in TextFormat.Tokenizer with direct char scanning. The JVM regex engine allocates garbage on every match (especially when calling Matcher.usePattern!). Since there are expected to be a lot of tokens, this caused substantial GC overhead. Direct char scanning also opens the possibility of other optimizations that aren't possible with regexes. For example: - direct reads from a char[] - streaming tokenization (rather than reading the complete source text) PiperOrigin-RevId: 687635759 --- .../java/com/google/protobuf/TextFormat.java | 236 +++++++++++++----- .../com/google/protobuf/TextFormatTest.java | 5 + 2 files changed, 176 insertions(+), 65 deletions(-) diff --git a/java/core/src/main/java/com/google/protobuf/TextFormat.java b/java/core/src/main/java/com/google/protobuf/TextFormat.java index 3963f8a8a462..a4dedb2e702d 100644 --- a/java/core/src/main/java/com/google/protobuf/TextFormat.java +++ b/java/core/src/main/java/com/google/protobuf/TextFormat.java @@ -24,7 +24,6 @@ import java.util.Map; import java.util.Set; import java.util.logging.Logger; -import java.util.regex.Matcher; import java.util.regex.Pattern; /** @@ -991,14 +990,9 @@ public void eol() throws IOException { * Scanner} provides no way to inspect the contents of delimiters, making it impossible to * keep track of line and column numbers. * - * - *

Luckily, Java's regular expression support does manage to be useful to us. (Barely: We need - * {@code Matcher.usePattern()}, which is new in Java 1.5.) So, we can use that, at least. - * Unfortunately, this implies that we need to have the entire input in one contiguous string. */ private static final class Tokenizer { private final CharSequence text; - private final Matcher matcher; private String currentToken; // The character index within this.text at which the current token begins. @@ -1007,29 +1001,13 @@ private static final class Tokenizer { // The line and column numbers of the current token. private int line = 0; private int column = 0; + private int lineInfoTrackingPos = 0; // The line and column numbers of the previous token (allows throwing // errors *after* consuming). private int previousLine = 0; private int previousColumn = 0; - // We use possessive quantifiers (*+ and ++) because otherwise the Java - // regex matcher has stack overflows on large inputs. - private static final Pattern WHITESPACE = Pattern.compile("(\\s|(#.*$))++", Pattern.MULTILINE); - private static final Pattern TOKEN = - Pattern.compile( - "[a-zA-Z_][0-9a-zA-Z_+-]*+|" // an identifier - + "[.]?[0-9+-][0-9a-zA-Z_.+-]*+|" // a number - + "\"([^\"\n\\\\]|\\\\.)*+(\"|\\\\?$)|" // a double-quoted string - + "\'([^\'\n\\\\]|\\\\.)*+(\'|\\\\?$)", // a single-quoted string - Pattern.MULTILINE); - - private static final Pattern DOUBLE_INFINITY = - Pattern.compile("-?inf(inity)?", Pattern.CASE_INSENSITIVE); - private static final Pattern FLOAT_INFINITY = - Pattern.compile("-?inf(inity)?f?", Pattern.CASE_INSENSITIVE); - private static final Pattern FLOAT_NAN = Pattern.compile("nanf?", Pattern.CASE_INSENSITIVE); - /** * {@link containsSilentMarkerAfterCurrentToken} indicates if there is a silent marker after the * current token. This value is moved to {@link containsSilentMarkerAfterPrevToken} every time @@ -1042,7 +1020,6 @@ private static final class Tokenizer { /** Construct a tokenizer that parses tokens from the given text. */ private Tokenizer(final CharSequence text) { this.text = text; - this.matcher = WHITESPACE.matcher(text); skipWhitespace(); nextToken(); } @@ -1082,41 +1059,156 @@ void nextToken() { previousColumn = column; // Advance the line counter to the current position. - while (pos < matcher.regionStart()) { - if (text.charAt(pos) == '\n') { + while (lineInfoTrackingPos < pos) { + if (text.charAt(lineInfoTrackingPos) == '\n') { ++line; column = 0; } else { ++column; } - ++pos; + ++lineInfoTrackingPos; } // Match the next token. - if (matcher.regionStart() == matcher.regionEnd()) { - // EOF - currentToken = ""; + if (pos == text.length()) { + currentToken = ""; // EOF } else { - matcher.usePattern(TOKEN); - if (matcher.lookingAt()) { - currentToken = matcher.group(); - matcher.region(matcher.end(), matcher.regionEnd()); - } else { - // Take one character. - currentToken = String.valueOf(text.charAt(pos)); - matcher.region(pos + 1, matcher.regionEnd()); + currentToken = nextTokenInternal(); + skipWhitespace(); + } + } + + private String nextTokenInternal() { + final int textLength = this.text.length(); + final int startPos = this.pos; + final char startChar = this.text.charAt(startPos); + + int endPos = pos; + if (isAlphaUnder(startChar)) { // Identifier + while (++endPos != textLength) { + char c = this.text.charAt(endPos); + if (!(isAlphaUnder(c) || isDigitPlusMinus(c))) { + break; + } } + } else if (isDigitPlusMinus(startChar) || startChar == '.') { // Number + if (startChar == '.') { // Optional leading dot + if (++endPos == textLength) { + return nextTokenSingleChar(); + } - skipWhitespace(); + if (!isDigitPlusMinus(this.text.charAt(endPos))) { // Mandatory first digit + return nextTokenSingleChar(); + } + } + + while (++endPos != textLength) { + char c = this.text.charAt(endPos); + if (!(isDigitPlusMinus(c) || isAlphaUnder(c) || c == '.')) { + break; + } + } + } else if (startChar == '"' || startChar == '\'') { // String + while (++endPos != textLength) { + char c = this.text.charAt(endPos); + if (c == startChar) { + ++endPos; + break; // Quote terminates + } else if (c == '\n') { + break; // Newline terminates (error during parsing) (not consumed) + } else if (c == '\\') { + if (++endPos == textLength) { + break; // Escape into end-of-text terminates (error during parsing) + } else if (this.text.charAt(endPos) == '\n') { + break; // Escape into newline terminates (error during parsing) (not consumed) + } else { + // Otherwise the escaped char is legal and consumed + } + } else { + // Otherwise the char is a legal and consumed + } + } + } else { + return nextTokenSingleChar(); // Unrecognized start character + } + + this.pos = endPos; + return this.text.subSequence(startPos, endPos).toString(); + } + + private static boolean isAlphaUnder(char c) { + // Defining this char-class with numeric comparisons is much faster than using a regex. + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_'; + } + + private static boolean isDigitPlusMinus(char c) { + // Defining this char-class with numeric comparisons is much faster than using a regex. + return ('0' <= c && c <= '9') || c == '+' || c == '-'; + } + + private static boolean isWhitespace(char c) { + // Defining this char-class with numeric comparisons is much faster than using a regex. + return c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t'; + } + + /** + * Produce a token for the single char at the current position. + * + *

We hardcode the expected single-char tokens to avoid allocating a unique string every + * time, which is a GC risk. String-literals are always loaded from the class constant pool. + * + *

This method must not be called if the current position is after the end-of-text. + */ + private String nextTokenSingleChar() { + final char c = this.text.charAt(this.pos++); + switch (c) { + case ':': + return ":"; + case ',': + return ","; + case '[': + return "["; + case ']': + return "]"; + case '{': + return "{"; + case '}': + return "}"; + case '<': + return "<"; + case '>': + return ">"; + default: + // If we don't recognize the char, create a string and let the parser report any errors + return String.valueOf(c); } } /** Skip over any whitespace so that the matcher region starts at the next token. */ private void skipWhitespace() { - matcher.usePattern(WHITESPACE); - if (matcher.lookingAt()) { - matcher.region(matcher.end(), matcher.regionEnd()); + final int textLength = this.text.length(); + final int startPos = this.pos; + + int endPos = this.pos - 1; + while (++endPos != textLength) { + char c = this.text.charAt(endPos); + if (c == '#') { + while (++endPos != textLength) { + if (this.text.charAt(endPos) == '\n') { + break; // Consume the newline as whitespace. + } + } + if (endPos == textLength) { + break; + } + } else if (isWhitespace(c)) { + // OK + } else { + break; + } } + + this.pos = endPos; } /** @@ -1148,8 +1240,7 @@ boolean lookingAtInteger() { return false; } - final char c = currentToken.charAt(0); - return ('0' <= c && c <= '9') || c == '-' || c == '+'; + return isDigitPlusMinus(currentToken.charAt(0)); } /** Returns {@code true} if the current token's text is equal to that specified. */ @@ -1164,11 +1255,7 @@ boolean lookingAt(String text) { String consumeIdentifier() throws ParseException { for (int i = 0; i < currentToken.length(); i++) { final char c = currentToken.charAt(i); - if (('a' <= c && c <= 'z') - || ('A' <= c && c <= 'Z') - || ('0' <= c && c <= '9') - || (c == '_') - || (c == '.')) { + if (isAlphaUnder(c) || ('0' <= c && c <= '9') || (c == '.')) { // OK } else { throw parseException("Expected identifier. Found '" + currentToken + "'"); @@ -1282,15 +1369,22 @@ public boolean tryConsumeUInt64() { public double consumeDouble() throws ParseException { // We need to parse infinity and nan separately because // Double.parseDouble() does not accept "inf", "infinity", or "nan". - if (DOUBLE_INFINITY.matcher(currentToken).matches()) { - final boolean negative = currentToken.startsWith("-"); - nextToken(); - return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; - } - if (currentToken.equalsIgnoreCase("nan")) { - nextToken(); - return Double.NaN; + switch (currentToken.toLowerCase(Locale.ROOT)) { + case "-inf": + case "-infinity": + nextToken(); + return Double.NEGATIVE_INFINITY; + case "inf": + case "infinity": + nextToken(); + return Double.POSITIVE_INFINITY; + case "nan": + nextToken(); + return Double.NaN; + default: + // fall through } + try { final double result = Double.parseDouble(currentToken); nextToken(); @@ -1320,15 +1414,27 @@ public boolean tryConsumeDouble() { public float consumeFloat() throws ParseException { // We need to parse infinity and nan separately because // Float.parseFloat() does not accept "inf", "infinity", or "nan". - if (FLOAT_INFINITY.matcher(currentToken).matches()) { - final boolean negative = currentToken.startsWith("-"); - nextToken(); - return negative ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY; - } - if (FLOAT_NAN.matcher(currentToken).matches()) { - nextToken(); - return Float.NaN; + switch (currentToken.toLowerCase(Locale.ROOT)) { + case "-inf": + case "-inff": + case "-infinity": + case "-infinityf": + nextToken(); + return Float.NEGATIVE_INFINITY; + case "inf": + case "inff": + case "infinity": + case "infinityf": + nextToken(); + return Float.POSITIVE_INFINITY; + case "nan": + case "nanf": + nextToken(); + return Float.NaN; + default: + // fall through } + try { final float result = Float.parseFloat(currentToken); nextToken(); diff --git a/java/core/src/test/java/com/google/protobuf/TextFormatTest.java b/java/core/src/test/java/com/google/protobuf/TextFormatTest.java index 4b6509c0db6f..8ae81b9a50b0 100644 --- a/java/core/src/test/java/com/google/protobuf/TextFormatTest.java +++ b/java/core/src/test/java/com/google/protobuf/TextFormatTest.java @@ -81,6 +81,7 @@ public class TextFormatTest { + "repeated_double: 0.125\n" + "repeated_double: .125\n" + "repeated_double: -.125\n" + + "repeated_double: .0\n" + "repeated_double: 1.23E17\n" + "repeated_double: 1.23E+17\n" + "repeated_double: -1.23e-17\n" @@ -314,6 +315,7 @@ public void testPrintExotic() throws Exception { .addRepeatedDouble(0.125) .addRepeatedDouble(.125) .addRepeatedDouble(-.125) + .addRepeatedDouble(.0) .addRepeatedDouble(123e15) .addRepeatedDouble(123e15) .addRepeatedDouble(-1.23e-17) @@ -949,6 +951,9 @@ public void testParseErrors() throws Exception { "1:23: Enum type \"protobuf_unittest.TestAllTypes.NestedEnum\" has no " + "value with number 123.", "optional_nested_enum: 123"); + assertParseError("1:18: Couldn't parse number: For input string: \".\"", "repeated_double: ."); + assertParseError( + "1:18: Couldn't parse number: For input string: \".+\"", "repeated_double: .+"); // Delimiters must match. assertParseError("1:22: Expected identifier. Found '}'", "OptionalGroup < a: 1 }");