Skip to content

Commit

Permalink
Replace all regexes in TextFormat.Tokenizer with direct char scanning.
Browse files Browse the repository at this point in the history
The JVM regex engine allocates garbage on every match (especially when calling Matcher.usePattern!). Since there are expected to be a lot of tokens, this caused substantial GC overhead.

Direct char scanning also opens the possibility of other optimizations that aren't possible with regexes. For example:
  - direct reads from a char[]
  - streaming tokenization (rather than reading the complete source text)

PiperOrigin-RevId: 687635759
  • Loading branch information
nreid260 authored and copybara-github committed Oct 24, 2024
1 parent 47613cf commit 739b531
Show file tree
Hide file tree
Showing 2 changed files with 176 additions and 65 deletions.
236 changes: 171 additions & 65 deletions java/core/src/main/java/com/google/protobuf/TextFormat.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
Expand Down Expand Up @@ -991,14 +990,9 @@ public void eol() throws IOException {
* Scanner} provides no way to inspect the contents of delimiters, making it impossible to
* keep track of line and column numbers.
* </ul>
*
* <p>Luckily, Java's regular expression support does manage to be useful to us. (Barely: We need
* {@code Matcher.usePattern()}, which is new in Java 1.5.) So, we can use that, at least.
* Unfortunately, this implies that we need to have the entire input in one contiguous string.
*/
private static final class Tokenizer {
private final CharSequence text;
private final Matcher matcher;
private String currentToken;

// The character index within this.text at which the current token begins.
Expand All @@ -1007,29 +1001,13 @@ private static final class Tokenizer {
// The line and column numbers of the current token.
private int line = 0;
private int column = 0;
private int lineInfoTrackingPos = 0;

// The line and column numbers of the previous token (allows throwing
// errors *after* consuming).
private int previousLine = 0;
private int previousColumn = 0;

// We use possessive quantifiers (*+ and ++) because otherwise the Java
// regex matcher has stack overflows on large inputs.
private static final Pattern WHITESPACE = Pattern.compile("(\\s|(#.*$))++", Pattern.MULTILINE);
private static final Pattern TOKEN =
Pattern.compile(
"[a-zA-Z_][0-9a-zA-Z_+-]*+|" // an identifier
+ "[.]?[0-9+-][0-9a-zA-Z_.+-]*+|" // a number
+ "\"([^\"\n\\\\]|\\\\.)*+(\"|\\\\?$)|" // a double-quoted string
+ "\'([^\'\n\\\\]|\\\\.)*+(\'|\\\\?$)", // a single-quoted string
Pattern.MULTILINE);

private static final Pattern DOUBLE_INFINITY =
Pattern.compile("-?inf(inity)?", Pattern.CASE_INSENSITIVE);
private static final Pattern FLOAT_INFINITY =
Pattern.compile("-?inf(inity)?f?", Pattern.CASE_INSENSITIVE);
private static final Pattern FLOAT_NAN = Pattern.compile("nanf?", Pattern.CASE_INSENSITIVE);

/**
* {@link containsSilentMarkerAfterCurrentToken} indicates if there is a silent marker after the
* current token. This value is moved to {@link containsSilentMarkerAfterPrevToken} every time
Expand All @@ -1042,7 +1020,6 @@ private static final class Tokenizer {
/** Construct a tokenizer that parses tokens from the given text. */
private Tokenizer(final CharSequence text) {
this.text = text;
this.matcher = WHITESPACE.matcher(text);
skipWhitespace();
nextToken();
}
Expand Down Expand Up @@ -1082,41 +1059,156 @@ void nextToken() {
previousColumn = column;

// Advance the line counter to the current position.
while (pos < matcher.regionStart()) {
if (text.charAt(pos) == '\n') {
while (lineInfoTrackingPos < pos) {
if (text.charAt(lineInfoTrackingPos) == '\n') {
++line;
column = 0;
} else {
++column;
}
++pos;
++lineInfoTrackingPos;
}

// Match the next token.
if (matcher.regionStart() == matcher.regionEnd()) {
// EOF
currentToken = "";
if (pos == text.length()) {
currentToken = ""; // EOF
} else {
matcher.usePattern(TOKEN);
if (matcher.lookingAt()) {
currentToken = matcher.group();
matcher.region(matcher.end(), matcher.regionEnd());
} else {
// Take one character.
currentToken = String.valueOf(text.charAt(pos));
matcher.region(pos + 1, matcher.regionEnd());
currentToken = nextTokenInternal();
skipWhitespace();
}
}

private String nextTokenInternal() {
final int textLength = this.text.length();
final int startPos = this.pos;
final char startChar = this.text.charAt(startPos);

int endPos = pos;
if (isAlphaUnder(startChar)) { // Identifier
while (++endPos != textLength) {
char c = this.text.charAt(endPos);
if (!(isAlphaUnder(c) || isDigitPlusMinus(c))) {
break;
}
}
} else if (isDigitPlusMinus(startChar) || startChar == '.') { // Number
if (startChar == '.') { // Optional leading dot
if (++endPos == textLength) {
return nextTokenSingleChar();
}

skipWhitespace();
if (!isDigitPlusMinus(this.text.charAt(endPos))) { // Mandatory first digit
return nextTokenSingleChar();
}
}

while (++endPos != textLength) {
char c = this.text.charAt(endPos);
if (!(isDigitPlusMinus(c) || isAlphaUnder(c) || c == '.')) {
break;
}
}
} else if (startChar == '"' || startChar == '\'') { // String
while (++endPos != textLength) {
char c = this.text.charAt(endPos);
if (c == startChar) {
++endPos;
break; // Quote terminates
} else if (c == '\n') {
break; // Newline terminates (error during parsing) (not consumed)
} else if (c == '\\') {
if (++endPos == textLength) {
break; // Escape into end-of-text terminates (error during parsing)
} else if (this.text.charAt(endPos) == '\n') {
break; // Escape into newline terminates (error during parsing) (not consumed)
} else {
// Otherwise the escaped char is legal and consumed
}
} else {
// Otherwise the char is a legal and consumed
}
}
} else {
return nextTokenSingleChar(); // Unrecognized start character
}

this.pos = endPos;
return this.text.subSequence(startPos, endPos).toString();
}

private static boolean isAlphaUnder(char c) {
// Defining this char-class with numeric comparisons is much faster than using a regex.
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_';
}

private static boolean isDigitPlusMinus(char c) {
// Defining this char-class with numeric comparisons is much faster than using a regex.
return ('0' <= c && c <= '9') || c == '+' || c == '-';
}

private static boolean isWhitespace(char c) {
// Defining this char-class with numeric comparisons is much faster than using a regex.
return c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t';
}

/**
* Produce a token for the single char at the current position.
*
* <p>We hardcode the expected single-char tokens to avoid allocating a unique string every
* time, which is a GC risk. String-literals are always loaded from the class constant pool.
*
* <p>This method must not be called if the current position is after the end-of-text.
*/
private String nextTokenSingleChar() {
final char c = this.text.charAt(this.pos++);
switch (c) {
case ':':
return ":";
case ',':
return ",";
case '[':
return "[";
case ']':
return "]";
case '{':
return "{";
case '}':
return "}";
case '<':
return "<";
case '>':
return ">";
default:
// If we don't recognize the char, create a string and let the parser report any errors
return String.valueOf(c);
}
}

/** Skip over any whitespace so that the matcher region starts at the next token. */
private void skipWhitespace() {
matcher.usePattern(WHITESPACE);
if (matcher.lookingAt()) {
matcher.region(matcher.end(), matcher.regionEnd());
final int textLength = this.text.length();
final int startPos = this.pos;

int endPos = this.pos - 1;
while (++endPos != textLength) {
char c = this.text.charAt(endPos);
if (c == '#') {
while (++endPos != textLength) {
if (this.text.charAt(endPos) == '\n') {
break; // Consume the newline as whitespace.
}
}
if (endPos == textLength) {
break;
}
} else if (isWhitespace(c)) {
// OK
} else {
break;
}
}

this.pos = endPos;
}

/**
Expand Down Expand Up @@ -1148,8 +1240,7 @@ boolean lookingAtInteger() {
return false;
}

final char c = currentToken.charAt(0);
return ('0' <= c && c <= '9') || c == '-' || c == '+';
return isDigitPlusMinus(currentToken.charAt(0));
}

/** Returns {@code true} if the current token's text is equal to that specified. */
Expand All @@ -1164,11 +1255,7 @@ boolean lookingAt(String text) {
String consumeIdentifier() throws ParseException {
for (int i = 0; i < currentToken.length(); i++) {
final char c = currentToken.charAt(i);
if (('a' <= c && c <= 'z')
|| ('A' <= c && c <= 'Z')
|| ('0' <= c && c <= '9')
|| (c == '_')
|| (c == '.')) {
if (isAlphaUnder(c) || ('0' <= c && c <= '9') || (c == '.')) {
// OK
} else {
throw parseException("Expected identifier. Found '" + currentToken + "'");
Expand Down Expand Up @@ -1282,15 +1369,22 @@ public boolean tryConsumeUInt64() {
public double consumeDouble() throws ParseException {
// We need to parse infinity and nan separately because
// Double.parseDouble() does not accept "inf", "infinity", or "nan".
if (DOUBLE_INFINITY.matcher(currentToken).matches()) {
final boolean negative = currentToken.startsWith("-");
nextToken();
return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
}
if (currentToken.equalsIgnoreCase("nan")) {
nextToken();
return Double.NaN;
switch (currentToken.toLowerCase(Locale.ROOT)) {
case "-inf":
case "-infinity":
nextToken();
return Double.NEGATIVE_INFINITY;
case "inf":
case "infinity":
nextToken();
return Double.POSITIVE_INFINITY;
case "nan":
nextToken();
return Double.NaN;
default:
// fall through
}

try {
final double result = Double.parseDouble(currentToken);
nextToken();
Expand Down Expand Up @@ -1320,15 +1414,27 @@ public boolean tryConsumeDouble() {
public float consumeFloat() throws ParseException {
// We need to parse infinity and nan separately because
// Float.parseFloat() does not accept "inf", "infinity", or "nan".
if (FLOAT_INFINITY.matcher(currentToken).matches()) {
final boolean negative = currentToken.startsWith("-");
nextToken();
return negative ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
}
if (FLOAT_NAN.matcher(currentToken).matches()) {
nextToken();
return Float.NaN;
switch (currentToken.toLowerCase(Locale.ROOT)) {
case "-inf":
case "-inff":
case "-infinity":
case "-infinityf":
nextToken();
return Float.NEGATIVE_INFINITY;
case "inf":
case "inff":
case "infinity":
case "infinityf":
nextToken();
return Float.POSITIVE_INFINITY;
case "nan":
case "nanf":
nextToken();
return Float.NaN;
default:
// fall through
}

try {
final float result = Float.parseFloat(currentToken);
nextToken();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ public class TextFormatTest {
+ "repeated_double: 0.125\n"
+ "repeated_double: .125\n"
+ "repeated_double: -.125\n"
+ "repeated_double: .0\n"
+ "repeated_double: 1.23E17\n"
+ "repeated_double: 1.23E+17\n"
+ "repeated_double: -1.23e-17\n"
Expand Down Expand Up @@ -314,6 +315,7 @@ public void testPrintExotic() throws Exception {
.addRepeatedDouble(0.125)
.addRepeatedDouble(.125)
.addRepeatedDouble(-.125)
.addRepeatedDouble(.0)
.addRepeatedDouble(123e15)
.addRepeatedDouble(123e15)
.addRepeatedDouble(-1.23e-17)
Expand Down Expand Up @@ -949,6 +951,9 @@ public void testParseErrors() throws Exception {
"1:23: Enum type \"protobuf_unittest.TestAllTypes.NestedEnum\" has no "
+ "value with number 123.",
"optional_nested_enum: 123");
assertParseError("1:18: Couldn't parse number: For input string: \".\"", "repeated_double: .");
assertParseError(
"1:18: Couldn't parse number: For input string: \".+\"", "repeated_double: .+");

// Delimiters must match.
assertParseError("1:22: Expected identifier. Found '}'", "OptionalGroup < a: 1 }");
Expand Down

0 comments on commit 739b531

Please sign in to comment.