From 04fbe9a2379d3e5e2fe929f698a4f44de7a7e812 Mon Sep 17 00:00:00 2001 From: Kyle Miller Date: Mon, 20 Nov 2023 12:47:48 -0800 Subject: [PATCH] feat: Rust-style raw string literals For example, `r#"The word "this" is in quotes."#`. Implements #1422 --- doc/lexical_structure.md | 18 ++++++- src/Init/Meta.lean | 13 ++++- src/Lean/Parser/Basic.lean | 59 +++++++++++++++++++++++ tests/lean/rawStringEOF.lean | 1 + tests/lean/rawStringEOF.lean.expected.out | 1 + tests/lean/run/rawStrings.lean | 16 ++++++ 6 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 tests/lean/rawStringEOF.lean create mode 100644 tests/lean/rawStringEOF.lean.expected.out create mode 100644 tests/lean/run/rawStrings.lean diff --git a/doc/lexical_structure.md b/doc/lexical_structure.md index 3b62d226bd4d..9d1d89a810a8 100644 --- a/doc/lexical_structure.md +++ b/doc/lexical_structure.md @@ -8,7 +8,7 @@ A Lean program consists of a stream of UTF-8 tokens where each token is one of the following: ``` - token: symbol | command | ident | string | char | numeral | + token: symbol | command | ident | string | raw_string | char | numeral | : decimal | doc_comment | mod_doc_comment | field_notation ``` @@ -90,6 +90,22 @@ So the complete syntax is: hex_char : [0-9a-fA-F] ``` +Raw String Literals +=================== + +Raw string literals are string literals without any escape character processing. +They begin with `r##...#"` (with zero or more `#` characters) and end with `"#...##` (with the same number of `#` characters). +The contents of a raw string literal may contain `"##..#` so long as the number of `#` characters +is less than the number of `#` characters used to begin the raw string literal. + +``` + raw_string : raw_string_aux(0) | raw_string_aux(1) | raw_string_aux(2) | ... + raw_string_aux(n) : 'r' '#'{n} '"' raw_string_item '"' '#'{n} + raw_string_item(n) : raw_string_char | raw_string_quote(n) + raw_string_char : [^"] + raw_string_quote(n) : '"' '#'{0..n-1} +``` + Char Literals ============= diff --git a/src/Init/Meta.lean b/src/Init/Meta.lean index dee5526ebb25..a21900e24e5c 100644 --- a/src/Init/Meta.lean +++ b/src/Init/Meta.lean @@ -786,8 +786,19 @@ partial def decodeStrLitAux (s : String) (i : String.Pos) (acc : String) : Optio else decodeStrLitAux s i (acc.push c) +partial def decodeRawStrLitAux (s : String) (i : String.Pos) (num : Nat) : Option String := do + let c := s.get i + let i := s.next i + if c == '#' then + decodeRawStrLitAux s i (num + 1) + else + s.extract i ⟨s.utf8ByteSize - (num + 1)⟩ + def decodeStrLit (s : String) : Option String := - decodeStrLitAux s ⟨1⟩ "" + if s.get 0 == 'r' then + decodeRawStrLitAux s ⟨1⟩ 0 + else + decodeStrLitAux s ⟨1⟩ "" def isStrLit? (stx : Syntax) : Option String := match isLit? strLitKind stx with diff --git a/src/Lean/Parser/Basic.lean b/src/Lean/Parser/Basic.lean index d34063e0557f..8b85bf68bbb3 100644 --- a/src/Lean/Parser/Basic.lean +++ b/src/Lean/Parser/Basic.lean @@ -627,6 +627,63 @@ partial def strLitFnAux (startPos : String.Pos) : ParserFn := fun c s => else if curr == '\\' then andthenFn quotedCharFn (strLitFnAux startPos) c s else strLitFnAux startPos c s +/-- Raw strings have the syntax `r##...#"..."#...##` with zero or more `#`'s. +If this is a valid start to a raw string (`r##...#"`), +returns the parser state after the `"` along with the number of `#`'s -/ +partial def rawStrLitStart (c : ParserContext) (s : ParserState) : Option (ParserState × Nat) := + let input := c.input + let i := s.pos + if h : input.atEnd i then none + else + let curr := input.get' i h + if curr == 'r' then + loop (s.next' input i h) 0 + else + none +where + loop (s : ParserState) (num : Nat) : Option (ParserState × Nat) := + let input := c.input + let i := s.pos + if h : input.atEnd i then none + else + let curr := input.get' i h + if curr == '#' then + loop (s.next' input i h) (num + 1) + else if curr == '"' then + some (s.next' input i h, num) + else + none + +/-- Parses a raw string literal after `rawStrLitStart` succeeds. +The `startPos` is the start of the raw string (at the `r`). +The number of hashes is `num`. +The parser state is assumed to be immediately after the first `"`. +If `closing?` is `some closingNum` then we are currently looking at `"##...#` with `closingNum` hashes. -/ +partial def rawStrLitFnAux (startPos : String.Pos) (num : Nat) (closing? : Option Nat) : ParserFn := fun c s => + let input := c.input + let i := s.pos + if h : input.atEnd i then s.mkUnexpectedErrorAt "unterminated raw string literal" startPos + else + let curr := input.get' i h + let s := s.setPos (input.next' i h) + if let some closingNum := closing? then + if curr == '#' then + if closingNum + 1 == num then + mkNodeToken strLitKind startPos c s + else + rawStrLitFnAux startPos num (some <| closingNum + 1) c s + else if curr == '\"' then + rawStrLitFnAux startPos num (some 0) c s + else + rawStrLitFnAux startPos num none c s + else if curr == '\"' then + if num == 0 then + mkNodeToken strLitKind startPos c s + else + rawStrLitFnAux startPos num (some 0) c s + else + rawStrLitFnAux startPos num none c s + def decimalNumberFn (startPos : String.Pos) (c : ParserContext) : ParserState → ParserState := fun s => let s := takeWhileFn (fun c => c.isDigit) c s let input := c.input @@ -820,6 +877,8 @@ private def tokenFnAux : ParserFn := fun c s => numberFnAux c s else if curr == '`' && isIdFirstOrBeginEscape (getNext input i) then nameLitAux i c s + else if let some (s, num) := rawStrLitStart c s then + rawStrLitFnAux i num none c s else let tk := c.tokens.matchPrefix input i identFnAux i tk .anonymous c s diff --git a/tests/lean/rawStringEOF.lean b/tests/lean/rawStringEOF.lean new file mode 100644 index 000000000000..e6ca73b9fa3d --- /dev/null +++ b/tests/lean/rawStringEOF.lean @@ -0,0 +1 @@ +#check r###"this is a raw string, unterminated"## diff --git a/tests/lean/rawStringEOF.lean.expected.out b/tests/lean/rawStringEOF.lean.expected.out new file mode 100644 index 000000000000..38feb3454659 --- /dev/null +++ b/tests/lean/rawStringEOF.lean.expected.out @@ -0,0 +1 @@ +rawStringEOF.lean:1:7: error: unterminated raw string literal diff --git a/tests/lean/run/rawStrings.lean b/tests/lean/run/rawStrings.lean new file mode 100644 index 000000000000..37b2adaa75d6 --- /dev/null +++ b/tests/lean/run/rawStrings.lean @@ -0,0 +1,16 @@ +example : r"" = "" := rfl +example : r#""# = "" := rfl +example : r"hi" = "hi" := rfl +example : r#"hi"# = "hi" := rfl +example : r#""hello""# = "\"hello\"" := rfl +example : r#""hello" said the world"# = "\"hello\" said the world" := rfl +example : r#"""# = "\"" := rfl +example : r"\n" = "\\n" := rfl +example : r"\" = "\\" := rfl +example : r#"#"# = "#" := rfl +example : r##"need two #'s in "# to close"## = "need two #'s in \"# to close" := rfl +example : r##"foo #"# bar"## = "foo #\"# bar" := rfl + +infix:100 " # " => Prod.mk + +example : r#"a"##"b" = ("a", "b") := rfl