From 75c1690097f3a5b1fa0e04598ea19b48c53d0b43 Mon Sep 17 00:00:00 2001 From: David Kunzmann Date: Wed, 1 Nov 2023 09:40:39 +0100 Subject: [PATCH] SONARPY-1542: Fix lexer as raw strings consider backslash as a character except before quotes (#1628) --- .../java/org/sonar/python/lexer/FStringChannel.java | 11 ++++++++--- .../java/org/sonar/python/lexer/PythonLexerTest.java | 9 +++++++-- .../src/test/resources/parser/own/fstring.py | 1 + 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/python-frontend/src/main/java/org/sonar/python/lexer/FStringChannel.java b/python-frontend/src/main/java/org/sonar/python/lexer/FStringChannel.java index c569aa2ed5..5f1db38ced 100644 --- a/python-frontend/src/main/java/org/sonar/python/lexer/FStringChannel.java +++ b/python-frontend/src/main/java/org/sonar/python/lexer/FStringChannel.java @@ -108,10 +108,11 @@ private boolean consumeFStringMiddle(List tokens, StringBuilder sb, FStri int column = code.getColumnPosition(); FStringState.Mode currentMode = state.getTokenizerMode(); while (code.charAt(0) != EOF) { - // In a raw string we consider \ as a character not as escape so we consume it as is - if (currentMode == Mode.FSTRING_MODE && state.isRawString && code.charAt(0) == '\\') { + // In a raw string we consider \ as a character not as escape so we consume it as is. + // Except for quotes which will be consumed as an escaped char + if (currentMode == Mode.FSTRING_MODE && isRawStringBackSlash(code, state)) { sb.append((char) code.pop()); - // If we encounter an escaped char we can consume the next two chars directly + // If we encounter an escaped char we can consume the next two chars directly } else if (currentMode == Mode.FSTRING_MODE && isEscapedChar(code)) { sb.append((char) code.pop()); sb.append((char) code.pop()); @@ -152,6 +153,10 @@ private static boolean canConsumeFStringPrefix(StringBuilder sb, CodeReader code return false; } + private static boolean isRawStringBackSlash(CodeReader code, FStringState state) { + return state.isRawString && code.charAt(0) == '\\' && !QUOTES.contains(code.charAt(1)); + } + private static boolean isUnicodeChar(StringBuilder sb) { int lastIndexOfUnicodeChar = sb.lastIndexOf("\\N"); return lastIndexOfUnicodeChar >= 0 && lastIndexOfUnicodeChar == sb.length() - 2; diff --git a/python-frontend/src/test/java/org/sonar/python/lexer/PythonLexerTest.java b/python-frontend/src/test/java/org/sonar/python/lexer/PythonLexerTest.java index 0b9584b9b9..0a78c05e7d 100644 --- a/python-frontend/src/test/java/org/sonar/python/lexer/PythonLexerTest.java +++ b/python-frontend/src/test/java/org/sonar/python/lexer/PythonLexerTest.java @@ -530,12 +530,17 @@ void fstring_complex_format_specifier() { @Test void fstring_escaped_regex_pattern() { - assertThat(lexer.lex("rf\"\\{{\\n\\}}\""), allOf( + assertThat(lexer.lex("rf\"\\{{\\n\\}}\\\"{a}\\\"\""), allOf( hasToken("rf\"", PythonTokenType.FSTRING_START), - hasToken("\\{{\\n\\}}", PythonTokenType.FSTRING_MIDDLE), + hasToken("\\{{\\n\\}}\\\"", PythonTokenType.FSTRING_MIDDLE), + hasToken("{", PythonPunctuator.LCURLYBRACE), + hasToken("a", GenericTokenType.IDENTIFIER), + hasToken("}", PythonPunctuator.RCURLYBRACE), + hasToken("\\\"", PythonTokenType.FSTRING_MIDDLE), hasToken("\"", PythonTokenType.FSTRING_END))); } + @Test void fstring_double_backslash() { assertThat(lexer.lex("f\"{a}\\\\\""), allOf( diff --git a/python-frontend/src/test/resources/parser/own/fstring.py b/python-frontend/src/test/resources/parser/own/fstring.py index 50bd752540..0d1a1efa67 100644 --- a/python-frontend/src/test/resources/parser/own/fstring.py +++ b/python-frontend/src/test/resources/parser/own/fstring.py @@ -28,3 +28,4 @@ F"\\ \"{a}\":\\" fr"""\s*\{{(.+)\}}""" rf'^add_example\(\s*"[^"]*",\s*{foo()},\s*\d+,\s*async \(client, console\) => \{{\n(.*?)^(?:\}}| *\}},\n)\);$' +fr'\"foo\"\s*{42}'