Skip to content

Commit

Permalink
SONARPY-1539: Fix lexing of raw string (#1626)
Browse files Browse the repository at this point in the history
  • Loading branch information
joke1196 authored Oct 31, 2023
1 parent 09ede81 commit 58bed0c
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ public boolean consume(CodeReader code, Lexer output) {
if (canConsumeFStringPrefix(sb, code)) {
char quote = code.charAt(0);
StringBuilder quotes = consumeFStringQuotes(code, quote);
FStringState newState = new FStringState(Mode.FSTRING_MODE, lexerState.brackets);
boolean isRawString = sb.indexOf("r") >= 0 || sb.indexOf("R") >= 0;
FStringState newState = new FStringState(Mode.FSTRING_MODE, lexerState.brackets, isRawString);
newState.setQuote(quote);
newState.setNumberOfQuotes(quotes.length());
lexerState.fStringStateStack.push(newState);
Expand All @@ -78,9 +79,9 @@ public boolean consume(CodeReader code, Lexer output) {
FStringState.Mode currentMode = currentState.getTokenizerMode();

if (currentMode == Mode.REGULAR_MODE && lexerState.fStringStateStack.size() > 1) {
// because the lexerState removes one to the count of brackets before entering this channel
// we need to adjust the comparison
if (c == '}' && currentState.getBrackets() -1 == lexerState.brackets) {
// Because the lexerState subtracts one from the count of brackets before entering this channel,
// we need to adjust the comparison.
if (c == '}' && currentState.getBrackets() - 1 == lexerState.brackets) {
Token rCurlyBraceToken = buildToken(PythonPunctuator.RCURLYBRACE, "}", output, line, column);
code.pop();
List<Token> tokens = new ArrayList<>();
Expand All @@ -94,7 +95,7 @@ public boolean consume(CodeReader code, Lexer output) {
code.pop();
List<Token> tokens = new ArrayList<>();
tokens.add(formatSpecifier);
FStringState newState = new FStringState(Mode.FORMAT_SPECIFIER_MODE, lexerState.brackets);
FStringState newState = new FStringState(Mode.FORMAT_SPECIFIER_MODE, lexerState.brackets, currentState.isRawString);
lexerState.fStringStateStack.push(newState);
return consumeFStringMiddle(tokens, sb, newState, code, output);
}
Expand All @@ -107,12 +108,16 @@ private boolean consumeFStringMiddle(List<Token> tokens, StringBuilder sb, FStri
int column = code.getColumnPosition();
FStringState.Mode currentMode = state.getTokenizerMode();
while (code.charAt(0) != EOF) {
if (currentMode == Mode.FSTRING_MODE && isEscapedChar(code) ) {
// In a raw string we consider \ as a character not as escape so we consume it as is
if (currentMode == Mode.FSTRING_MODE && state.isRawString && code.charAt(0) == '\\') {
sb.append((char) code.pop());
// If we encounter an escaped char we can consume the next two chars directly
} else if (currentMode == Mode.FSTRING_MODE && isEscapedChar(code)) {
sb.append((char) code.pop());
sb.append((char) code.pop());
} else if (code.charAt(0) == '{' && !isUnicodeChar(sb)) {
addFStringMiddleToTokens(tokens, sb, output, line, column);
addLCurlBraceAndSwitchToRegularMode(tokens, code, output);
addLCurlBraceAndSwitchToRegularMode(tokens, code, output, state);
addTokens(tokens, output);
return true;
} else if (currentMode == Mode.FORMAT_SPECIFIER_MODE && code.charAt(0) == '}') {
Expand Down Expand Up @@ -140,15 +145,15 @@ private static boolean canConsumeFStringPrefix(StringBuilder sb, CodeReader code
return true;
} else if (PREFIXES.contains(firstChar) && PREFIXES.contains(secondChar) &&
!firstChar.equals(secondChar) && QUOTES.contains(code.charAt(2))) {
sb.append((char) code.pop());
sb.append((char) code.pop());
return true;
}
sb.append((char) code.pop());
sb.append((char) code.pop());
return true;
}
return false;
}

private static boolean isUnicodeChar(StringBuilder sb ){
int lastIndexOfUnicodeChar = sb.lastIndexOf("\\N");
/**
 * Tells whether the text accumulated so far ends with the introducer of a named
 * Unicode escape, i.e. a {@code \N} whose backslash is not itself escaped.
 *
 * <p>The caller uses the result to decide whether a following <code>{</code> belongs to
 * the escape (as in <code>\N{RIGHTWARDS ARROW}</code>) or opens a replacement field.
 * A trailing {@code \\N} (escaped backslash followed by a plain {@code N}) is therefore
 * not reported as a Unicode escape.
 *
 * <p>NOTE(review): this method has no access to the raw-string flag, so a raw f-string
 * ending in a single {@code \N} is still reported as an escape; confirm whether the
 * caller should guard that case.
 *
 * @param sb text accumulated so far for the current f-string part
 * @return {@code true} when {@code sb} ends with {@code \N} preceded by an even number
 *         (possibly zero) of additional backslashes
 */
private static boolean isUnicodeChar(StringBuilder sb) {
  int length = sb.length();
  if (length < 2 || sb.charAt(length - 1) != 'N' || sb.charAt(length - 2) != '\\') {
    return false;
  }
  // Count the run of backslashes directly before the 'N'. Backslashes pair up as
  // escaped backslashes, so only an odd run leaves one that actually escapes the 'N'.
  int backslashes = 0;
  for (int i = length - 2; i >= 0 && sb.charAt(i) == '\\'; i--) {
    backslashes++;
  }
  return backslashes % 2 == 1;
}

Expand Down Expand Up @@ -178,11 +183,11 @@ private void addFStringEndToTokens(CodeReader code, char quote, List<Token> toke
tokens.add(fStringEndToken);
}

private void addLCurlBraceAndSwitchToRegularMode(List<Token> tokens, CodeReader code, Lexer output) {
private void addLCurlBraceAndSwitchToRegularMode(List<Token> tokens, CodeReader code, Lexer output, FStringState currentState) {
Token curlyBraceToken = buildToken(PythonPunctuator.LCURLYBRACE, "{", output, code.getLinePosition(), code.getColumnPosition());
code.pop();
lexerState.brackets++;
FStringState updatedState = new FStringState(FStringState.Mode.REGULAR_MODE, lexerState.brackets);
FStringState updatedState = new FStringState(FStringState.Mode.REGULAR_MODE, lexerState.brackets, currentState.isRawString);
lexerState.fStringStateStack.push(updatedState);
tokens.add(curlyBraceToken);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public class FStringState {
Character quote;
int numberOfQuotes;
int brackets;
boolean isRawString;


public enum Mode {
Expand All @@ -34,9 +35,10 @@ public enum Mode {

private Mode tokenizerMode;

public FStringState(Mode mode, int brackets) {
/**
 * Creates a state entry holding the tokenizer mode, the bracket count and the
 * raw-string flag it is given.
 *
 * @param mode tokenizer mode stored in this state
 * @param brackets bracket count stored in this state
 * @param isRawString raw-string flag stored in this state
 */
public FStringState(Mode mode, int brackets, boolean isRawString) {
  this.isRawString = isRawString;
  this.brackets = brackets;
  this.tokenizerMode = mode;
}

public Character getQuote() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public void reset() {
brackets = 0;
joined = false;
fStringStateStack.clear();
fStringStateStack.push(new FStringState(Mode.REGULAR_MODE, brackets));
fStringStateStack.push(new FStringState(Mode.REGULAR_MODE, brackets, false));
}

public void reset(int initialLine, int initialColumn) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,14 @@ void fstring_complex_format_specifier() {
hasToken("\"", PythonTokenType.FSTRING_END)));
}

/**
 * Lexes a raw f-string made of backslashes and doubled curly braces and expects exactly
 * a start token, one middle token carrying the text verbatim, and an end token.
 */
@Test
void fstring_escaped_regex_pattern() {
  String rawFString = "rf\"\\{{\\n\\}}\"";
  assertThat(lexer.lex(rawFString), allOf(
    hasToken("rf\"", PythonTokenType.FSTRING_START),
    hasToken("\\{{\\n\\}}", PythonTokenType.FSTRING_MIDDLE),
    hasToken("\"", PythonTokenType.FSTRING_END)));
}

@Test
void fstring_double_backslash() {
assertThat(lexer.lex("f\"{a}\\\\\""), allOf(
Expand Down
2 changes: 2 additions & 0 deletions python-frontend/src/test/resources/parser/own/fstring.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,5 @@
f'\N{RIGHTWARDS ARROW}'
f" \\"
F"\\ \"{a}\":\\"
fr"""\s*\{{(.+)\}}"""
rf'^add_example\(\s*"[^"]*",\s*{foo()},\s*\d+,\s*async \(client, console\) => \{{\n(.*?)^(?:\}}| *\}},\n)\);$'

0 comments on commit 58bed0c

Please sign in to comment.