From 58bed0cfe7e9bf044160ab1f62f359c523a975b9 Mon Sep 17 00:00:00 2001 From: David Kunzmann Date: Tue, 31 Oct 2023 14:31:00 +0100 Subject: [PATCH] SONARPY-1539: Fix lexing of raw string (#1626) --- .../sonar/python/lexer/FStringChannel.java | 35 +++++++++++-------- .../org/sonar/python/lexer/FStringState.java | 4 ++- .../org/sonar/python/lexer/LexerState.java | 2 +- .../sonar/python/lexer/PythonLexerTest.java | 8 +++++ .../src/test/resources/parser/own/fstring.py | 2 ++ 5 files changed, 34 insertions(+), 17 deletions(-) diff --git a/python-frontend/src/main/java/org/sonar/python/lexer/FStringChannel.java b/python-frontend/src/main/java/org/sonar/python/lexer/FStringChannel.java index 42cb109644..c569aa2ed5 100644 --- a/python-frontend/src/main/java/org/sonar/python/lexer/FStringChannel.java +++ b/python-frontend/src/main/java/org/sonar/python/lexer/FStringChannel.java @@ -64,7 +64,8 @@ public boolean consume(CodeReader code, Lexer output) { if (canConsumeFStringPrefix(sb, code)) { char quote = code.charAt(0); StringBuilder quotes = consumeFStringQuotes(code, quote); - FStringState newState = new FStringState(Mode.FSTRING_MODE, lexerState.brackets); + boolean isRawString = sb.indexOf("r") >= 0 || sb.indexOf("R") >= 0; + FStringState newState = new FStringState(Mode.FSTRING_MODE, lexerState.brackets, isRawString); newState.setQuote(quote); newState.setNumberOfQuotes(quotes.length()); lexerState.fStringStateStack.push(newState); @@ -78,9 +79,9 @@ public boolean consume(CodeReader code, Lexer output) { FStringState.Mode currentMode = currentState.getTokenizerMode(); if (currentMode == Mode.REGULAR_MODE && lexerState.fStringStateStack.size() > 1) { - // because the lexerState removes one to the count of brackets before entering this channel - // we need to adjust the comparison - if (c == '}' && currentState.getBrackets() -1 == lexerState.brackets) { + // because the lexerState removes one to the count of brackets before entering this channel + // we need to adjust the comparison + if (c == '}' && currentState.getBrackets() - 1 == lexerState.brackets) { Token rCurlyBraceToken = buildToken(PythonPunctuator.RCURLYBRACE, "}", output, line, column); code.pop(); List tokens = new ArrayList<>(); @@ -94,7 +95,7 @@ public boolean consume(CodeReader code, Lexer output) { code.pop(); List tokens = new ArrayList<>(); tokens.add(formatSpecifier); - FStringState newState = new FStringState(Mode.FORMAT_SPECIFIER_MODE, lexerState.brackets); + FStringState newState = new FStringState(Mode.FORMAT_SPECIFIER_MODE, lexerState.brackets, currentState.isRawString); lexerState.fStringStateStack.push(newState); return consumeFStringMiddle(tokens, sb, newState, code, output); } @@ -107,12 +108,16 @@ private boolean consumeFStringMiddle(List tokens, StringBuilder sb, FStri int column = code.getColumnPosition(); FStringState.Mode currentMode = state.getTokenizerMode(); while (code.charAt(0) != EOF) { - if (currentMode == Mode.FSTRING_MODE && isEscapedChar(code) ) { + // In a raw string we consider \ as a character not as escape so we consume it as is + if (currentMode == Mode.FSTRING_MODE && state.isRawString && code.charAt(0) == '\\') { + sb.append((char) code.pop()); + // If we encounter an escaped char we can consume the next two chars directly + } else if (currentMode == Mode.FSTRING_MODE && isEscapedChar(code)) { sb.append((char) code.pop()); sb.append((char) code.pop()); } else if (code.charAt(0) == '{' && !isUnicodeChar(sb)) { addFStringMiddleToTokens(tokens, sb, output, line, column); - addLCurlBraceAndSwitchToRegularMode(tokens, code, output); + addLCurlBraceAndSwitchToRegularMode(tokens, code, output, state); addTokens(tokens, output); return true; } else if (currentMode == Mode.FORMAT_SPECIFIER_MODE && code.charAt(0) == '}') { @@ -140,15 +145,15 @@ private static boolean canConsumeFStringPrefix(StringBuilder sb, CodeReader code return true; } else if (PREFIXES.contains(firstChar) && PREFIXES.contains(secondChar) && !firstChar.equals(secondChar) && QUOTES.contains(code.charAt(2))) { - sb.append((char) code.pop()); - sb.append((char) code.pop()); - return true; - } + sb.append((char) code.pop()); + sb.append((char) code.pop()); + return true; + } return false; } - private static boolean isUnicodeChar(StringBuilder sb ){ - int lastIndexOfUnicodeChar = sb.lastIndexOf("\\N"); + private static boolean isUnicodeChar(StringBuilder sb) { + int lastIndexOfUnicodeChar = sb.lastIndexOf("\\N"); return lastIndexOfUnicodeChar >= 0 && lastIndexOfUnicodeChar == sb.length() - 2; } @@ -178,11 +183,11 @@ private void addFStringEndToTokens(CodeReader code, char quote, List toke tokens.add(fStringEndToken); } - private void addLCurlBraceAndSwitchToRegularMode(List tokens, CodeReader code, Lexer output) { + private void addLCurlBraceAndSwitchToRegularMode(List tokens, CodeReader code, Lexer output, FStringState currentState) { Token curlyBraceToken = buildToken(PythonPunctuator.LCURLYBRACE, "{", output, code.getLinePosition(), code.getColumnPosition()); code.pop(); lexerState.brackets++; - FStringState updatedState = new FStringState(FStringState.Mode.REGULAR_MODE, lexerState.brackets); + FStringState updatedState = new FStringState(FStringState.Mode.REGULAR_MODE, lexerState.brackets, currentState.isRawString); lexerState.fStringStateStack.push(updatedState); tokens.add(curlyBraceToken); } diff --git a/python-frontend/src/main/java/org/sonar/python/lexer/FStringState.java b/python-frontend/src/main/java/org/sonar/python/lexer/FStringState.java index d093c5f530..e1440b4930 100644 --- a/python-frontend/src/main/java/org/sonar/python/lexer/FStringState.java +++ b/python-frontend/src/main/java/org/sonar/python/lexer/FStringState.java @@ -24,6 +24,7 @@ public class FStringState { Character quote; int numberOfQuotes; int brackets; + boolean isRawString; public enum Mode { @@ -34,9 +35,10 @@ public enum Mode { private Mode tokenizerMode; - public FStringState(Mode mode, int brackets) { + public FStringState(Mode mode, int brackets, boolean isRawString) { this.tokenizerMode = mode; this.brackets = brackets; + this.isRawString = isRawString; } public Character getQuote() { diff --git a/python-frontend/src/main/java/org/sonar/python/lexer/LexerState.java b/python-frontend/src/main/java/org/sonar/python/lexer/LexerState.java index fb613a1e62..b5258b822b 100644 --- a/python-frontend/src/main/java/org/sonar/python/lexer/LexerState.java +++ b/python-frontend/src/main/java/org/sonar/python/lexer/LexerState.java @@ -42,7 +42,7 @@ public void reset() { brackets = 0; joined = false; fStringStateStack.clear(); - fStringStateStack.push(new FStringState(Mode.REGULAR_MODE, brackets)); + fStringStateStack.push(new FStringState(Mode.REGULAR_MODE, brackets, false)); } public void reset(int initialLine, int initialColumn) { diff --git a/python-frontend/src/test/java/org/sonar/python/lexer/PythonLexerTest.java b/python-frontend/src/test/java/org/sonar/python/lexer/PythonLexerTest.java index 61d1dd92f9..0b9584b9b9 100644 --- a/python-frontend/src/test/java/org/sonar/python/lexer/PythonLexerTest.java +++ b/python-frontend/src/test/java/org/sonar/python/lexer/PythonLexerTest.java @@ -528,6 +528,14 @@ void fstring_complex_format_specifier() { hasToken("\"", PythonTokenType.FSTRING_END))); } + @Test + void fstring_escaped_regex_pattern() { + assertThat(lexer.lex("rf\"\\{{\\n\\}}\""), allOf( + hasToken("rf\"", PythonTokenType.FSTRING_START), + hasToken("\\{{\\n\\}}", PythonTokenType.FSTRING_MIDDLE), + hasToken("\"", PythonTokenType.FSTRING_END))); + } + @Test void fstring_double_backslash() { assertThat(lexer.lex("f\"{a}\\\\\""), allOf( diff --git a/python-frontend/src/test/resources/parser/own/fstring.py b/python-frontend/src/test/resources/parser/own/fstring.py index ddf1559869..50bd752540 100644 --- a/python-frontend/src/test/resources/parser/own/fstring.py +++ b/python-frontend/src/test/resources/parser/own/fstring.py @@ -26,3 +26,5 @@ f'\N{RIGHTWARDS ARROW}' f" \\" F"\\ \"{a}\":\\" +fr"""\s*\{{(.+)\}}""" +rf'^add_example\(\s*"[^"]*",\s*{foo()},\s*\d+,\s*async \(client, console\) => \{{\n(.*?)^(?:\}}| *\}},\n)\);$'