From 639ce57ad53d9dcf8ec02e61d50e9f1fc6a688aa Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Thu, 22 Jun 2023 16:16:40 +0800 Subject: [PATCH 01/42] rename ParserTree to ParseTree --- pag-parser/src/fusion.rs | 14 +++++++------- pag-parser/src/lib.rs | 2 +- tests/arith-expr/src/lib.rs | 2 +- tests/sexpr-calculator/src/lib.rs | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pag-parser/src/fusion.rs b/pag-parser/src/fusion.rs index e66e6ca..b98d1a2 100644 --- a/pag-parser/src/fusion.rs +++ b/pag-parser/src/fusion.rs @@ -35,14 +35,14 @@ fn generate_tag_enum(parser: &Parser<'_, '_>) -> TokenStream { fn generate_parse_tree() -> TokenStream { quote! { #[derive(Debug, Clone, PartialEq, Eq, Hash)] - pub struct ParserTree<'a> { + pub struct ParseTree<'a> { tag: Tag, src: &'a str, span: core::ops::Range, children: alloc::vec::Vec } - impl <'a> ParserTree<'a> { + impl <'a> ParseTree<'a> { pub fn new(tag: Tag, src: &'a str) -> Self { Self { tag, @@ -169,7 +169,7 @@ fn generate_empty_actions(active: bool, symbols: &[Symbol<'_>]) -> Vec( if let Some(sym) = next_tree_indices.get(&0) { let tag = format_ident!("{}", sym.name()); actions.push(quote! { - let mut subtree = ParserTree::new(Tag::#tag, src); + let mut subtree = ParseTree::new(Tag::#tag, src); }); subtree = true; } @@ -341,7 +341,7 @@ fn generate_inactive_parser<'src>( fn #parser_name<'a>( src: &'a str, mut offset: usize, - parent: &mut ParserTree<'a>, + parent: &mut ParseTree<'a>, ) -> Result { #expect let mut cursor; @@ -400,9 +400,9 @@ fn generate_active_parser<'src>( fn #parser_name( src: &str, mut offset: usize, - ) -> Result { + ) -> Result { #expect - let mut tree = ParserTree::new(Tag::#tag_ident, src); + let mut tree = ParseTree::new(Tag::#tag_ident, src); let mut cursor; 'parser: loop { cursor = offset; diff --git a/pag-parser/src/lib.rs b/pag-parser/src/lib.rs index db02afd..355eb87 100644 --- a/pag-parser/src/lib.rs +++ b/pag-parser/src/lib.rs @@ -282,7 +282,7 @@ pub fn generate_parser(input: &str) -> Result { clippy::match_single_binding, )] #parser_routines - pub fn parse(input: &str) -> Result { + pub fn parse(input: &str) -> Result { #entrypoint(input, 0) } }) diff --git a/tests/arith-expr/src/lib.rs b/tests/arith-expr/src/lib.rs index b528f03..f6def5e 100644 --- a/tests/arith-expr/src/lib.rs +++ b/tests/arith-expr/src/lib.rs @@ -6,7 +6,7 @@ use std::num::Wrapping; mod parser; #[allow(dead_code)] -fn eval(tree: &parser::ParserTree) -> Wrapping { +fn eval(tree: &parser::ParseTree) -> Wrapping { match tree.tag() { parser::Tag::expr => tree.children()[..].iter().map(eval).sum(), parser::Tag::mult => tree.children()[..].iter().map(eval).product(), diff --git a/tests/sexpr-calculator/src/lib.rs b/tests/sexpr-calculator/src/lib.rs index ef9fa51..e3a768c 100644 --- a/tests/sexpr-calculator/src/lib.rs +++ b/tests/sexpr-calculator/src/lib.rs @@ -6,7 +6,7 @@ use std::num::Wrapping; mod parser; #[allow(dead_code)] -fn eval(tree: &parser::ParserTree) -> Wrapping { +fn eval(tree: &parser::ParseTree) -> Wrapping { match tree.tag() { parser::Tag::sexpr => eval(&tree.children()[0]), parser::Tag::int => Wrapping(tree.as_slice().parse::().unwrap()), From 9d92fd05ec9a6aa99690bee2872532a6526422db Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Thu, 22 Jun 2023 18:36:50 +0800 Subject: [PATCH 02/42] avoid name conflicts --- pag-parser/src/fusion.rs | 8 +++----- pag-parser/src/lib.rs | 2 +- pag-parser/src/nf.rs | 6 +----- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/pag-parser/src/fusion.rs b/pag-parser/src/fusion.rs index b98d1a2..f8c12b2 100644 --- a/pag-parser/src/fusion.rs +++ b/pag-parser/src/fusion.rs @@ -303,8 +303,7 @@ fn generate_inactive_parser<'src>( rules: &[&NormalForm<'src>], loop_optimizer: &mut LoopOptimizer, ) -> TokenStream { - let tag_name = format!("{tag}"); - let parser_name = format_ident!("parse_{tag_name}"); + let parser_name = format_ident!("parse_{tag}"); let expect = generate_expect(rules); let success_actions = generate_children(&tag, false, parser, rules) @@ -362,9 +361,8 @@ fn generate_active_parser<'src>( rules: &[&NormalForm<'src>], loop_optimizer: &mut LoopOptimizer, ) -> TokenStream { - let tag_name = format!("{tag}"); - let tag_ident = format_ident!("{tag_name}"); - let parser_name = format_ident!("parse_{tag_name}"); + let tag_ident = format_ident!("{}", tag.symbol().name()); + let parser_name = format_ident!("parse_{tag}"); let expect = generate_expect(rules); let success_actions = generate_children(&tag, true, parser, rules) diff --git a/pag-parser/src/lib.rs b/pag-parser/src/lib.rs index 355eb87..a980cfc 100644 --- a/pag-parser/src/lib.rs +++ b/pag-parser/src/lib.rs @@ -268,7 +268,7 @@ pub fn generate_parser(input: &str) -> Result { merge_inactive_rules(&mut nfs, &parser, &nf_arena); remove_unreachable_rules(&mut nfs, &parser); let parser_routines = fusion_parser(&nfs, &parser); - let entrypoint = format_ident!("parse_{}", parser.entrypoint.name()); + let entrypoint = format_ident!("parse_{}_0", parser.entrypoint.name()); Ok(quote::quote! { #![allow( dead_code, diff --git a/pag-parser/src/nf.rs b/pag-parser/src/nf.rs index 1b05d52..47433e8 100644 --- a/pag-parser/src/nf.rs +++ b/pag-parser/src/nf.rs @@ -38,11 +38,7 @@ impl<'src> Tag<'src> { impl<'src> Display for Tag<'src> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.symbol.fmt(f)?; - if self.version > 0 { - write!(f, "_{}", self.version)?; - } - Ok(()) + write!(f, "{}_{}", self.symbol, self.version) } } From f5b18668d0ac73b9b4912c83fd70115d3e39c389 Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Mon, 17 Jul 2023 23:36:28 +0800 Subject: [PATCH 03/42] make clippy happy --- pag-lexer/src/utilities.rs | 2 +- pag-parser/src/lib.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pag-lexer/src/utilities.rs b/pag-lexer/src/utilities.rs index b14a70d..adb2440 100644 --- a/pag-lexer/src/utilities.rs +++ b/pag-lexer/src/utilities.rs @@ -10,7 +10,7 @@ where } #[cfg(debug_assertions)] { - let mut vec = Vec::from_iter(data.into_iter()); + let mut vec = Vec::from_iter(data); vec.sort_unstable_by_key(_f); vec.into_iter() } diff --git a/pag-parser/src/lib.rs b/pag-parser/src/lib.rs index a980cfc..0687b7b 100644 --- a/pag-parser/src/lib.rs +++ b/pag-parser/src/lib.rs @@ -80,7 +80,7 @@ impl<'src> Error<'src> { Report::build(ReportKind::Error, input_name, span.start) .with_message("Syntax error in grammar definition") .with_label(Label::new((input_name, span)) - .with_message(format!("{}", x.variant.message())) + .with_message(x.variant.message()) .with_color(Color::Red)) .finish() }, @@ -88,7 +88,7 @@ impl<'src> Error<'src> { Report::build(ReportKind::Error, input_name, span.start()) .with_message("Format error in grammar definition") .with_label(Label::new((input_name, span.start()..span.end())) - .with_message(format!("{}", message)) + .with_message(message) .with_color(Color::Red)) .finish() }, From f7792d64fa8ac1085d3e7eb61b178db060d03ada Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Tue, 25 Jul 2023 04:02:08 +0800 Subject: [PATCH 04/42] impl prototype of new frontend (wip) --- Cargo.toml | 11 +- pag-parser2/Cargo.toml | 26 +++++ pag-parser2/src/frontend/ast.rs | 56 ++++++++++ pag-parser2/src/frontend/mod.rs | 13 +++ pag-parser2/src/frontend/parse.rs | 165 ++++++++++++++++++++++++++++++ pag-parser2/src/lib.rs | 9 ++ 6 files changed, 270 insertions(+), 10 deletions(-) create mode 100644 pag-parser2/Cargo.toml create mode 100644 pag-parser2/src/frontend/ast.rs create mode 100644 pag-parser2/src/frontend/mod.rs create mode 100644 pag-parser2/src/frontend/parse.rs create mode 100644 pag-parser2/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index adb3388..7a4017e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,16 +7,7 @@ # modified, or distributed except according to those terms. [workspace] -members = [ - "pag-lexer", - "pag-parser", - "pag-compiler", - "tests/sexpr-calculator", - "tests/arith-expr", - "tests/tokenizer", - "benches/csv", - "benches/json", -] +members = ["pag-*", "tests/*", "benches/*"] resolver = "2" [workspace.package] diff --git a/pag-parser2/Cargo.toml b/pag-parser2/Cargo.toml new file mode 100644 index 0000000..ce26614 --- /dev/null +++ b/pag-parser2/Cargo.toml @@ -0,0 +1,26 @@ +# Copyright (c) 2023 Paguroidea Developers +# +# Licensed under the Apache License, Version 2.0 +# or the MIT +# license , at your +# option. All files in the project carrying such notice may not be copied, +# modified, or distributed except according to those terms. + +[package] +name = "pag-parser2" +keywords = ["parser", "cfg", "grammar"] +description = "Parser-lexer fusion generator (parser generator)" +documentation = "https://docs.rs/pag-parser/" + +version.workspace = true +edition.workspace = true +license.workspace = true +exclude.workspace = true +categories.workspace = true +repository.workspace = true +rust-version.workspace = true +authors.workspace = true +readme.workspace = true + +[dependencies] +syn = "2.0.27" diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs new file mode 100644 index 0000000..f591cbf --- /dev/null +++ b/pag-parser2/src/frontend/ast.rs @@ -0,0 +1,56 @@ +// Copyright (c) 2023 Paguroidea Developers +// +// Licensed under the Apache License, Version 2.0 +// or the MIT +// license , at your +// option. All files in the project carrying such notice may not be copied, +// modified, or distributed except according to those terms. + +use std::collections::HashMap; + +pub struct Ast { + pub entry: syn::Ident, + pub skip: Option, + pub lexer_map: HashMap, + pub parser_map: HashMap, +} + +pub struct ParserDef { + pub ty: syn::Type, + pub rules: Vec, +} + +pub struct ParserRule { + pub bindings: Vec, + pub action: Option, +} + +pub struct ParserBinding { + pub name: Option, + pub ty: Option, + pub tree: ParserTree, +} + +// TODO: how to express "bottom" & "empty"? +pub enum LexerTree { + Alt(Vec>), + Seq(Vec>), + And(Vec>), + Star(Box), + Plus(Box), + Opt(Box), + Not(Box), + Ref(syn::Ident), + Str(syn::LitStr), + Range(syn::LitChar, syn::LitChar), +} + +// TODO: how to express "select" & "ignore"? +pub enum ParserTree { + Seq(Vec>), + Star(Box), + Plus(Box), + Opt(Box), + LexerRef(syn::Ident), + ParserRef(syn::Ident), +} diff --git a/pag-parser2/src/frontend/mod.rs b/pag-parser2/src/frontend/mod.rs new file mode 100644 index 0000000..777e86d --- /dev/null +++ b/pag-parser2/src/frontend/mod.rs @@ -0,0 +1,13 @@ +// Copyright (c) 2023 Paguroidea Developers +// +// Licensed under the Apache License, Version 2.0 +// or the MIT +// license , at your +// option. All files in the project carrying such notice may not be copied, +// modified, or distributed except according to those terms. + +mod ast; +mod parse; + +pub use ast::*; +pub use parse::*; diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs new file mode 100644 index 0000000..1eb6c61 --- /dev/null +++ b/pag-parser2/src/frontend/parse.rs @@ -0,0 +1,165 @@ +// Copyright (c) 2023 Paguroidea Developers +// +// Licensed under the Apache License, Version 2.0 +// or the MIT +// license , at your +// option. All files in the project carrying such notice may not be copied, +// modified, or distributed except according to those terms. + +use super::ast::*; + +use syn::parse::{Parse, ParseStream}; +use syn::punctuated::Punctuated; +use syn::{parse_quote, Token}; + +use std::collections::HashMap; + +enum IdentKind { + LexerName, + ParserName, + Invalid, +} + +fn ident_kind(ident: &syn::Ident) -> IdentKind { + let s = ident.to_string(); // TODO: should we add a `.unraw()` ? + if s.chars().all(|c| matches!(c, 'A'..='Z' | '0'..='9' | '_')) { + return IdentKind::LexerName; + } + if s.chars().all(|c| matches!(c, 'a'..='z' | '0'..='9' | '_')) { + return IdentKind::ParserName; + } + IdentKind::Invalid +} + +impl Parse for Ast { + fn parse(input: ParseStream) -> syn::Result { + let mut entry = None; + let mut skip = None; + let mut lexer_map = HashMap::new(); + let mut parser_map = HashMap::new(); + + while !input.is_empty() { + if input.peek(Token![%]) { + // parse keyword + input.parse::()?; + let ident = input.parse::()?; + match ident.to_string().as_str() { + "entry" => { + input.parse::()?; + entry = Some(input.parse::()?); + } + "skip" => { + input.parse::()?; + skip = Some(input.parse::()?); + } + _ => return Err(syn::Error::new(ident.span(), "invalid keyword")), + } + } else { + // parse lexer / parser definitions + let ident = input.parse::()?; + match ident_kind(&ident) { + IdentKind::LexerName => { + input.parse::()?; + lexer_map.insert(ident, input.parse::()?); + } + IdentKind::ParserName => { + parser_map.insert(ident, input.parse::()?); + } + _ => return Err(syn::Error::new(ident.span(), "invalid ident")), + } + } + input.parse::()?; + } + + Ok(Self { + entry: entry.ok_or_else(|| input.error("missing %entry"))?, + skip, + lexer_map, + parser_map, + }) + } +} + +impl Parse for ParserDef { + // (":" syn::Type)? = (ParserRule)|+ + fn parse(input: ParseStream) -> syn::Result { + let ty = match input.parse::() { + Ok(_) => input.parse::()?, + Err(_) => parse_quote!(&'src str), + }; + + input.parse::()?; + + // let mut rules = Vec::new(); + // loop { + // rules.push(input.parse::()?); + // if !input.peek(Token![|]) { + // break; + // } + // input.parse::(); + // } + + // TODO: check whether this is in-place + let rules = Punctuated::::parse_separated_nonempty(input)? + .into_iter() + .collect(); + + Ok(Self { ty, rules }) + } +} + +impl Parse for ParserRule { + // (ParserBinding)+ syn::Block? + fn parse(input: ParseStream) -> syn::Result { + let mut bindings = Vec::new(); + while !input.peek(syn::token::Brace) && !input.peek(Token![|]) && !input.peek(Token![;]) { + bindings.push(input.parse::()?); + } + + let mut action = None; + if input.peek(syn::token::Brace) { + action = Some(input.parse::()?); + } + + Ok(Self { bindings, action }) + } +} + +impl Parse for ParserBinding { + // ("$" syn::Ident ("<" syn::Type ">")? ":")? ParserTree + fn parse(input: ParseStream) -> syn::Result { + let mut name = None; + let mut ty = None; + + if input.peek(Token![$]) { + input.parse::()?; + name = Some(input.parse::()?); + + if input.peek(Token![<]) { + input.parse::()?; + ty = Some(input.parse::()?); + input.parse::]>()?; + } + + input.parse::()?; + } + + let tree = input.parse::()?; + + Ok(Self { name, ty, tree }) + } +} + +impl Parse for LexerTree { + // pratt parsing + fn parse(input: ParseStream) -> syn::Result { + todo!() + } +} + +impl Parse for ParserTree { + // pratt parsing + fn parse(input: ParseStream) -> syn::Result { + todo!() + } +} diff --git a/pag-parser2/src/lib.rs b/pag-parser2/src/lib.rs new file mode 100644 index 0000000..3602390 --- /dev/null +++ b/pag-parser2/src/lib.rs @@ -0,0 +1,9 @@ +// Copyright (c) 2023 Paguroidea Developers +// +// Licensed under the Apache License, Version 2.0 +// or the MIT +// license , at your +// option. All files in the project carrying such notice may not be copied, +// modified, or distributed except according to those terms. + +mod frontend; From 6386241c259102ef1909cef39570cbe68ba3bebe Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Mon, 24 Jul 2023 23:54:05 -0400 Subject: [PATCH 05/42] initial work to add semact --- pag-parser2/Cargo.toml | 1 + pag-parser2/src/frontend/ast.rs | 8 +++--- pag-parser2/src/frontend/parse.rs | 4 +-- pag-parser2/src/lib.rs | 1 + pag-parser2/src/nf/mod.rs | 45 +++++++++++++++++++++++++++++++ pag-parser2/src/nf/semact.rs | 41 ++++++++++++++++++++++++++++ 6 files changed, 94 insertions(+), 6 deletions(-) create mode 100644 pag-parser2/src/nf/mod.rs create mode 100644 pag-parser2/src/nf/semact.rs diff --git a/pag-parser2/Cargo.toml b/pag-parser2/Cargo.toml index ce26614..925356d 100644 --- a/pag-parser2/Cargo.toml +++ b/pag-parser2/Cargo.toml @@ -24,3 +24,4 @@ readme.workspace = true [dependencies] syn = "2.0.27" +quote = "1.0.9" \ No newline at end of file diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index f591cbf..caadc3c 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -33,9 +33,9 @@ pub struct ParserBinding { // TODO: how to express "bottom" & "empty"? pub enum LexerTree { - Alt(Vec>), - Seq(Vec>), - And(Vec>), + Alt(Vec), + Seq(Vec), + And(Vec), Star(Box), Plus(Box), Opt(Box), @@ -47,7 +47,7 @@ pub enum LexerTree { // TODO: how to express "select" & "ignore"? pub enum ParserTree { - Seq(Vec>), + Seq(Vec), Star(Box), Plus(Box), Opt(Box), diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index 1eb6c61..8f81715 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -152,14 +152,14 @@ impl Parse for ParserBinding { impl Parse for LexerTree { // pratt parsing - fn parse(input: ParseStream) -> syn::Result { + fn parse(_input: ParseStream) -> syn::Result { todo!() } } impl Parse for ParserTree { // pratt parsing - fn parse(input: ParseStream) -> syn::Result { + fn parse(_input: ParseStream) -> syn::Result { todo!() } } diff --git a/pag-parser2/src/lib.rs b/pag-parser2/src/lib.rs index 3602390..11e33c1 100644 --- a/pag-parser2/src/lib.rs +++ b/pag-parser2/src/lib.rs @@ -7,3 +7,4 @@ // modified, or distributed except according to those terms. mod frontend; +mod nf; diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs new file mode 100644 index 0000000..02a0655 --- /dev/null +++ b/pag-parser2/src/nf/mod.rs @@ -0,0 +1,45 @@ +use quote::format_ident; +use syn::Ident; +mod semact; + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Tag { + Toplevel(Ident), + Anonymous(usize), +} + +impl Tag { + pub fn toplevel(ident: Ident) -> Self { + Self::Toplevel(ident) + } + pub fn anonymous(index: usize) -> Self { + Self::Anonymous(index) + } + /// Identifier of the parser routine. + pub fn parser_name(&self) -> Ident { + match self { + Self::Anonymous(index) => format_ident!("__anonymous_{}", index), + Self::Toplevel(ident) => format_ident!("parse_{}", ident), + } + } +} + +/// Action in the normal form. +/// If this subroutine's return value is taken, it should mark [`Action::output`] as `true`. +/// There is no need to assign an ident to a subroutine. As we are always +/// reducing from left to right, we maintain the context of which the current +/// semantic action to reduce, and always assign "__0", "__1", "__2". When a [`Reduce`] is +/// encountered, we start over from "__0". +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Action { + Shift { + /// Parser routine to call. + tag: Tag, + output: bool, + }, + Reduce { + /// Reduction routine to call. + tag: Tag, + output: bool, + }, +} diff --git a/pag-parser2/src/nf/semact.rs b/pag-parser2/src/nf/semact.rs new file mode 100644 index 0000000..1673dae --- /dev/null +++ b/pag-parser2/src/nf/semact.rs @@ -0,0 +1,41 @@ +use std::collections::HashMap; + +use quote::format_ident; +use syn::{parse_quote, Expr, ExprCall, Stmt, Type}; + +use super::Tag; + +pub type SemActTable = HashMap; +pub struct SemAct { + /// Identifier of the semantic action routine. + function: Expr, + /// Type annotation + ty: Option, + /// Number of arguments + arity: usize, +} + +impl SemAct { + fn generate_call(&self) -> ExprCall { + let exprs = (0..self.arity).map(|i| format_ident!("__{}", i)); + let function = &self.function; + parse_quote!( + #function(#(#exprs),*) + ) + } + pub fn generate_statement(&self, output: Option) -> Stmt { + let expr = self.generate_call(); + match output { + None => parse_quote!( + #expr; + ), + Some(index) => { + let ty = self.ty.iter(); + let output = format_ident!("__{}", index); + parse_quote!( + let #output #(: #ty)* = #expr; + ) + } + } + } +} From 4e87b65295d874b5adde42687843908ee0f91c98 Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Tue, 25 Jul 2023 17:28:54 +0800 Subject: [PATCH 06/42] minor update --- pag-parser/src/frontend/mod.rs | 4 ++- pag-parser/src/frontend/syntax.rs | 2 +- pag-parser/src/fusion.rs | 4 ++- pag-parser/src/nf.rs | 13 ++++++-- pag-parser/src/type_system/fixpoint.rs | 4 ++- pag-parser/src/utilities.rs | 4 ++- pag-parser2/Cargo.toml | 4 +-- pag-parser2/src/frontend/ast.rs | 16 +++++----- pag-parser2/src/frontend/parse.rs | 42 +++++++++++--------------- pag-parser2/src/nf/mod.rs | 8 +++++ pag-parser2/src/nf/semact.rs | 8 +++++ 11 files changed, 67 insertions(+), 42 deletions(-) diff --git a/pag-parser/src/frontend/mod.rs b/pag-parser/src/frontend/mod.rs index 8fa04b7..067624e 100644 --- a/pag-parser/src/frontend/mod.rs +++ b/pag-parser/src/frontend/mod.rs @@ -493,7 +493,9 @@ mod test { dbg!(size_of::()); let pairs = GrammarParser::parse(Rule::grammar, TEST).unwrap(); let tree = parse_surface_syntax(pairs, &PRATT_PARSER, TEST).unwrap(); - let Grammar { lexer, parser } = &tree.node else { unreachable!() }; + let Grammar { lexer, parser } = &tree.node else { + unreachable!() + }; println!("\n---------< construct lexer database >----------"); let database = construct_lexer_database(lexer).unwrap(); diff --git a/pag-parser/src/frontend/syntax.rs b/pag-parser/src/frontend/syntax.rs index 6119d92..b1ee998 100644 --- a/pag-parser/src/frontend/syntax.rs +++ b/pag-parser/src/frontend/syntax.rs @@ -71,7 +71,7 @@ pub fn construct_parser<'src, 'arena>( }; let mut errs = Vec::new(); for rule in rules { - let ParserRuleDef { active, name, expr, } = &rule.node else { + let ParserRuleDef { active, name, expr } = &rule.node else { unreachable_branch!("parser should only contain rule definitions") }; match construct_core_syntax_tree(&parser, expr) { diff --git a/pag-parser/src/fusion.rs b/pag-parser/src/fusion.rs index f8c12b2..a6a289b 100644 --- a/pag-parser/src/fusion.rs +++ b/pag-parser/src/fusion.rs @@ -199,7 +199,9 @@ fn generate_children<'src>( .iter() .filter(|x| !matches!(x, NormalForm::Empty(..))) .map(|nf| { - let NormalForm::Sequence { nonterminals, .. } = nf else { unreachable!() }; + let NormalForm::Sequence { nonterminals, .. } = nf else { + unreachable!() + }; let mut add_continue = false; let mut actions = Vec::new(); diff --git a/pag-parser/src/nf.rs b/pag-parser/src/nf.rs index 47433e8..3332861 100644 --- a/pag-parser/src/nf.rs +++ b/pag-parser/src/nf.rs @@ -297,7 +297,10 @@ pub fn merge_inactive_rules<'src, 'nf>( let NormalForm::Sequence { terminal, nonterminals, - } = j else { continue }; + } = j + else { + continue; + }; if nonterminals.contains(&Action::Subroutine(tag)) { *j = &*arena.alloc(NormalForm::Sequence { terminal: *terminal, @@ -328,9 +331,13 @@ pub fn remove_unreachable_rules<'src>(nfs: &mut NormalForms<'src, '_>, parser: & return; } visited.insert(current); - let Some(tag) = nfs.entries.get(¤t) else { return }; + let Some(tag) = nfs.entries.get(¤t) else { + return; + }; for i in tag { - let NormalForm::Sequence { nonterminals, .. } = i else { continue }; + let NormalForm::Sequence { nonterminals, .. } = i else { + continue; + }; for i in nonterminals { let Action::Subroutine(x) = i else { continue }; dfs(nfs, *x, visited); diff --git a/pag-parser/src/type_system/fixpoint.rs b/pag-parser/src/type_system/fixpoint.rs index a6e297a..3e4ca85 100644 --- a/pag-parser/src/type_system/fixpoint.rs +++ b/pag-parser/src/type_system/fixpoint.rs @@ -39,7 +39,9 @@ fn find_neighbors( Term::Fix(_, expr) => find_neighbors(expr, neighbors, sym_to_id), Term::ParserRef(symbol) => { // unexisted IDs refer to implicit fixpoints - let Some(&id) = sym_to_id.get(symbol) else { return }; + let Some(&id) = sym_to_id.get(symbol) else { + return; + }; neighbors.push(id); } _ => {} diff --git a/pag-parser/src/utilities.rs b/pag-parser/src/utilities.rs index c1beeca..487e503 100644 --- a/pag-parser/src/utilities.rs +++ b/pag-parser/src/utilities.rs @@ -43,7 +43,9 @@ fn is_ascii_ident_head(x: &u8) -> bool { } fn is_ascii_ident(s: &str) -> bool { - let [x, xs@..] = s.as_bytes() else { return false }; + let [x, xs @ ..] = s.as_bytes() else { + return false; + }; is_ascii_ident_head(x) && xs.iter().all(is_ascii_ident_body) } diff --git a/pag-parser2/Cargo.toml b/pag-parser2/Cargo.toml index 925356d..e9a6228 100644 --- a/pag-parser2/Cargo.toml +++ b/pag-parser2/Cargo.toml @@ -23,5 +23,5 @@ authors.workspace = true readme.workspace = true [dependencies] -syn = "2.0.27" -quote = "1.0.9" \ No newline at end of file +syn = { version = "2.0.27", features = ["full"] } +quote = "1.0.9" diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index caadc3c..f8dd109 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -10,8 +10,8 @@ use std::collections::HashMap; pub struct Ast { pub entry: syn::Ident, - pub skip: Option, - pub lexer_map: HashMap, + pub skip: Option, + pub lexer_map: HashMap, pub parser_map: HashMap, } @@ -21,18 +21,18 @@ pub struct ParserDef { } pub struct ParserRule { - pub bindings: Vec, + pub vars: Vec, pub action: Option, } -pub struct ParserBinding { +pub struct VarBinding { pub name: Option, pub ty: Option, - pub tree: ParserTree, + pub expr: ParserExpr, } -// TODO: how to express "bottom" & "empty"? -pub enum LexerTree { +// TODO: how to express "bottom" & "any"? +pub enum LexerExpr { Alt(Vec), Seq(Vec), And(Vec), @@ -46,7 +46,7 @@ pub enum LexerTree { } // TODO: how to express "select" & "ignore"? -pub enum ParserTree { +pub enum ParserExpr { Seq(Vec), Star(Box), Plus(Box), diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index 8f81715..23ae51b 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -9,7 +9,6 @@ use super::ast::*; use syn::parse::{Parse, ParseStream}; -use syn::punctuated::Punctuated; use syn::{parse_quote, Token}; use std::collections::HashMap; @@ -50,7 +49,7 @@ impl Parse for Ast { } "skip" => { input.parse::()?; - skip = Some(input.parse::()?); + skip = Some(input.parse::()?); } _ => return Err(syn::Error::new(ident.span(), "invalid keyword")), } @@ -60,7 +59,7 @@ impl Parse for Ast { match ident_kind(&ident) { IdentKind::LexerName => { input.parse::()?; - lexer_map.insert(ident, input.parse::()?); + lexer_map.insert(ident, input.parse::()?); } IdentKind::ParserName => { parser_map.insert(ident, input.parse::()?); @@ -90,19 +89,14 @@ impl Parse for ParserDef { input.parse::()?; - // let mut rules = Vec::new(); - // loop { - // rules.push(input.parse::()?); - // if !input.peek(Token![|]) { - // break; - // } - // input.parse::(); - // } - - // TODO: check whether this is in-place - let rules = Punctuated::::parse_separated_nonempty(input)? - .into_iter() - .collect(); + let mut rules = Vec::new(); + loop { + rules.push(input.parse::()?); + if !input.peek(Token![|]) { + break; + } + input.parse::()?; + } Ok(Self { ty, rules }) } @@ -111,9 +105,9 @@ impl Parse for ParserDef { impl Parse for ParserRule { // (ParserBinding)+ syn::Block? fn parse(input: ParseStream) -> syn::Result { - let mut bindings = Vec::new(); + let mut vars = Vec::new(); while !input.peek(syn::token::Brace) && !input.peek(Token![|]) && !input.peek(Token![;]) { - bindings.push(input.parse::()?); + vars.push(input.parse::()?); } let mut action = None; @@ -121,11 +115,11 @@ impl Parse for ParserRule { action = Some(input.parse::()?); } - Ok(Self { bindings, action }) + Ok(Self { vars, action }) } } -impl Parse for ParserBinding { +impl Parse for VarBinding { // ("$" syn::Ident ("<" syn::Type ">")? ":")? ParserTree fn parse(input: ParseStream) -> syn::Result { let mut name = None; @@ -144,20 +138,20 @@ impl Parse for ParserBinding { input.parse::()?; } - let tree = input.parse::()?; + let expr = input.parse::()?; - Ok(Self { name, ty, tree }) + Ok(Self { name, ty, expr }) } } -impl Parse for LexerTree { +impl Parse for LexerExpr { // pratt parsing fn parse(_input: ParseStream) -> syn::Result { todo!() } } -impl Parse for ParserTree { +impl Parse for ParserExpr { // pratt parsing fn parse(_input: ParseStream) -> syn::Result { todo!() diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index 02a0655..29d6c92 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -1,3 +1,11 @@ +// Copyright (c) 2023 Paguroidea Developers +// +// Licensed under the Apache License, Version 2.0 +// or the MIT +// license , at your +// option. All files in the project carrying such notice may not be copied, +// modified, or distributed except according to those terms. + use quote::format_ident; use syn::Ident; mod semact; diff --git a/pag-parser2/src/nf/semact.rs b/pag-parser2/src/nf/semact.rs index 1673dae..15d5c42 100644 --- a/pag-parser2/src/nf/semact.rs +++ b/pag-parser2/src/nf/semact.rs @@ -1,3 +1,11 @@ +// Copyright (c) 2023 Paguroidea Developers +// +// Licensed under the Apache License, Version 2.0 +// or the MIT +// license , at your +// option. All files in the project carrying such notice may not be copied, +// modified, or distributed except according to those terms. + use std::collections::HashMap; use quote::format_ident; From cc5b341ef728d646676d649fe5e3f7b7786675ed Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Tue, 25 Jul 2023 19:34:11 +0800 Subject: [PATCH 07/42] impl LexerExpr::parse --- pag-parser2/src/frontend/ast.rs | 8 +- pag-parser2/src/frontend/parse.rs | 127 ++++++++++++++++++++++++++++-- 2 files changed, 126 insertions(+), 9 deletions(-) diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index f8dd109..a09b1df 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -33,9 +33,9 @@ pub struct VarBinding { // TODO: how to express "bottom" & "any"? pub enum LexerExpr { - Alt(Vec), - Seq(Vec), - And(Vec), + Alt(Box, Box), + Seq(Box, Box), + And(Box, Box), Star(Box), Plus(Box), Opt(Box), @@ -47,7 +47,7 @@ pub enum LexerExpr { // TODO: how to express "select" & "ignore"? pub enum ParserExpr { - Seq(Vec), + Seq(Box, Box), Star(Box), Plus(Box), Opt(Box), diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index 23ae51b..c80e68c 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -8,11 +8,13 @@ use super::ast::*; +use syn::ext::IdentExt; use syn::parse::{Parse, ParseStream}; -use syn::{parse_quote, Token}; +use syn::{parenthesized, parse_quote, Token}; use std::collections::HashMap; +#[derive(PartialEq, Eq)] enum IdentKind { LexerName, ParserName, @@ -20,7 +22,7 @@ enum IdentKind { } fn ident_kind(ident: &syn::Ident) -> IdentKind { - let s = ident.to_string(); // TODO: should we add a `.unraw()` ? + let s = ident.unraw().to_string(); if s.chars().all(|c| matches!(c, 'A'..='Z' | '0'..='9' | '_')) { return IdentKind::LexerName; } @@ -145,15 +147,130 @@ impl Parse for VarBinding { } impl Parse for LexerExpr { - // pratt parsing - fn parse(_input: ParseStream) -> syn::Result { - todo!() + fn parse(input: ParseStream) -> syn::Result { + parse_lexer_expr(input, 0) } } +// pratt parsing +fn parse_lexer_expr(input: ParseStream, min_bp: u32) -> syn::Result { + let mut lhs = 'lhs: { + if input.peek(syn::Ident) { + let ident = input.parse::()?; + if ident_kind(&ident) != IdentKind::LexerName { + return Err(syn::Error::new(ident.span(), "invalid ident")); + } + break 'lhs LexerExpr::Ref(ident); + } + if input.peek(syn::LitStr) { + let str = input.parse::()?; + break 'lhs LexerExpr::Str(str); + } + if input.peek(syn::LitChar) { + let l = input.parse::()?; + input.parse::()?; + let r = input.parse::()?; + break 'lhs LexerExpr::Range(l, r); + } + if input.peek(syn::token::Paren) { + let content; + parenthesized!(content in input); + break 'lhs content.parse::()?; + } + if input.peek(Token![!]) { + input.parse::()?; + let r_bp = 60; + let rhs = parse_lexer_expr(input, r_bp)?; + break 'lhs LexerExpr::Not(Box::new(rhs)); + } + return Err(input.error("expect lexer expression")); + }; + + loop { + if input.peek(Token![|]) { + let (l_bp, r_bp) = (30, 31); + if l_bp < min_bp { + break; + } + input.parse::()?; + let rhs = parse_lexer_expr(input, r_bp)?; + lhs = LexerExpr::Alt(Box::new(lhs), Box::new(rhs)); + continue; + } + if input.peek(syn::Ident) + || input.peek(syn::LitStr) + || input.peek(syn::LitChar) + || input.peek(syn::token::Paren) + || input.peek(syn::token::Paren) + || input.peek(Token![!]) + { + let (l_bp, r_bp) = (40, 41); + if l_bp < min_bp { + break; + } + let rhs = parse_lexer_expr(input, r_bp)?; + lhs = LexerExpr::Seq(Box::new(lhs), Box::new(rhs)); + continue; + } + if input.peek(Token![&]) { + let (l_bp, r_bp) = (50, 51); + if l_bp < min_bp { + break; + } + input.parse::()?; + let rhs = parse_lexer_expr(input, r_bp)?; + lhs = LexerExpr::And(Box::new(lhs), Box::new(rhs)); + continue; + } + if input.peek(Token![*]) { + let l_bp = 70; + if l_bp < min_bp { + break; + } + input.parse::()?; + lhs = LexerExpr::Star(Box::new(lhs)); + continue; + } + if input.peek(Token![+]) { + let l_bp = 80; + if l_bp < min_bp { + break; + } + input.parse::()?; + lhs = LexerExpr::Plus(Box::new(lhs)); + continue; + } + if input.peek(Token![?]) { + let l_bp = 90; + if l_bp < min_bp { + break; + } + input.parse::()?; + lhs = LexerExpr::Opt(Box::new(lhs)); + continue; + } + break; + } + + Ok(lhs) +} + impl Parse for ParserExpr { // pratt parsing fn parse(_input: ParseStream) -> syn::Result { todo!() } } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_lexer_expr() { + syn::parse_str::(r#"("abc" 'a'..'z') r#A | B & C | D* E+ F? !G"#).unwrap(); + } + + #[test] + fn test_parser_expr() {} +} From 17561e830bcfc6408ebb8283973c4444b4079f41 Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Tue, 25 Jul 2023 19:45:51 +0800 Subject: [PATCH 08/42] impl ParserExpr::parse --- pag-parser2/src/frontend/parse.rs | 76 +++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 9 deletions(-) diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index c80e68c..0b0859e 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -22,7 +22,7 @@ enum IdentKind { } fn ident_kind(ident: &syn::Ident) -> IdentKind { - let s = ident.unraw().to_string(); + let s = ident.to_string(); if s.chars().all(|c| matches!(c, 'A'..='Z' | '0'..='9' | '_')) { return IdentKind::LexerName; } @@ -43,7 +43,7 @@ impl Parse for Ast { if input.peek(Token![%]) { // parse keyword input.parse::()?; - let ident = input.parse::()?; + let ident = input.parse::()?.unraw(); match ident.to_string().as_str() { "entry" => { input.parse::()?; @@ -57,7 +57,7 @@ impl Parse for Ast { } } else { // parse lexer / parser definitions - let ident = input.parse::()?; + let ident = input.parse::()?.unraw(); match ident_kind(&ident) { IdentKind::LexerName => { input.parse::()?; @@ -129,7 +129,7 @@ impl Parse for VarBinding { if input.peek(Token![$]) { input.parse::()?; - name = Some(input.parse::()?); + name = Some(input.parse::()?.unraw()); if input.peek(Token![<]) { input.parse::()?; @@ -156,7 +156,7 @@ impl Parse for LexerExpr { fn parse_lexer_expr(input: ParseStream, min_bp: u32) -> syn::Result { let mut lhs = 'lhs: { if input.peek(syn::Ident) { - let ident = input.parse::()?; + let ident = input.parse::()?.unraw(); if ident_kind(&ident) != IdentKind::LexerName { return Err(syn::Error::new(ident.span(), "invalid ident")); } @@ -256,10 +256,66 @@ fn parse_lexer_expr(input: ParseStream, min_bp: u32) -> syn::Result { } impl Parse for ParserExpr { - // pratt parsing - fn parse(_input: ParseStream) -> syn::Result { - todo!() + fn parse(input: ParseStream) -> syn::Result { + parse_parser_expr(input, 0) + } +} + +// pratt parsing +fn parse_parser_expr(input: ParseStream, min_bp: u32) -> syn::Result { + let mut lhs = 'lhs: { + if input.peek(syn::Ident) { + let ident = input.parse::()?.unraw(); + match ident_kind(&ident) { + IdentKind::LexerName => break 'lhs ParserExpr::LexerRef(ident), + IdentKind::ParserName => break 'lhs ParserExpr::ParserRef(ident), + _ => return Err(syn::Error::new(ident.span(), "invalid ident")), + } + } + return Err(input.error("expect lexer expression")); + }; + + loop { + if input.peek(syn::Ident) { + let (l_bp, r_bp) = (40, 41); + if l_bp < min_bp { + break; + } + let rhs = parse_parser_expr(input, r_bp)?; + lhs = ParserExpr::Seq(Box::new(lhs), Box::new(rhs)); + continue; + } + if input.peek(Token![*]) { + let l_bp = 70; + if l_bp < min_bp { + break; + } + input.parse::()?; + lhs = ParserExpr::Star(Box::new(lhs)); + continue; + } + if input.peek(Token![+]) { + let l_bp = 80; + if l_bp < min_bp { + break; + } + input.parse::()?; + lhs = ParserExpr::Plus(Box::new(lhs)); + continue; + } + if input.peek(Token![?]) { + let l_bp = 90; + if l_bp < min_bp { + break; + } + input.parse::()?; + lhs = ParserExpr::Opt(Box::new(lhs)); + continue; + } + break; } + + Ok(lhs) } #[cfg(test)] @@ -272,5 +328,7 @@ mod test { } #[test] - fn test_parser_expr() {} + fn test_parser_expr() { + syn::parse_str::(r#"A? b c* D+ F?"#).unwrap(); + } } From e7d40ff505e3c7798ee2ada6b8a824a6e4692a27 Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Tue, 25 Jul 2023 20:13:43 +0800 Subject: [PATCH 09/42] finish new frontend parser --- pag-parser2/src/frontend/parse.rs | 39 ++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index 0b0859e..0df6e2e 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -10,7 +10,7 @@ use super::ast::*; use syn::ext::IdentExt; use syn::parse::{Parse, ParseStream}; -use syn::{parenthesized, parse_quote, Token}; +use syn::{bracketed, parenthesized, parse_quote, Token}; use std::collections::HashMap; @@ -105,7 +105,7 @@ impl Parse for ParserDef { } impl Parse for ParserRule { - // (ParserBinding)+ syn::Block? + // (VarBinding)+ syn::Block? fn parse(input: ParseStream) -> syn::Result { let mut vars = Vec::new(); while !input.peek(syn::token::Brace) && !input.peek(Token![|]) && !input.peek(Token![;]) { @@ -122,19 +122,19 @@ impl Parse for ParserRule { } impl Parse for VarBinding { - // ("$" syn::Ident ("<" syn::Type ">")? ":")? ParserTree + // ("$" syn::Ident ("<" syn::Type ">")? ":")? ParserExpr fn parse(input: ParseStream) -> syn::Result { let mut name = None; let mut ty = None; if input.peek(Token![$]) { - input.parse::()?; + input.parse::()?; name = Some(input.parse::()?.unraw()); - if input.peek(Token![<]) { - input.parse::()?; - ty = Some(input.parse::()?); - input.parse::]>()?; + if input.peek(syn::token::Bracket) { + let content; + bracketed!(content in input); + ty = Some(content.parse::()?); } input.parse::()?; @@ -331,4 +331,27 @@ mod test { fn test_parser_expr() { syn::parse_str::(r#"A? b c* D+ F?"#).unwrap(); } + + #[test] + fn test_full() { + syn::parse_str::( + r#" + %entry = sexpr; + + BLANK = " "; + DIGIT = '0'..'9'; + ALPHA = 'a'..'z' | 'A'..'Z'; + LPAREN = "("; + RPAREN = ")"; + ATOM = ALPHA (ALPHA | DIGIT)*; + %skip = (BLANK | "\t" | "\n" | "\r")+; + + compound: SExp = LPAREN $sexp[Vec<_>]:sexp+ RPAREN { SExp::Compound(sexp) }; + atom : SExp = $atom:ATOM { SExp::Atom(atom) }; + sexp : SExp = compound + | atom; + "#, + ) + .unwrap(); + } } From 98d4da65587b84cb5e983d5ac9933a5fc379bfe1 Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Tue, 25 Jul 2023 22:00:07 +0800 Subject: [PATCH 10/42] impl post-fix var binding --- pag-parser2/src/frontend/ast.rs | 2 +- pag-parser2/src/frontend/parse.rs | 37 ++++++++++++++++--------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index a09b1df..1b40038 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -26,9 +26,9 @@ pub struct ParserRule { } pub struct VarBinding { + pub expr: ParserExpr, pub name: Option, pub ty: Option, - pub expr: ParserExpr, } // TODO: how to express "bottom" & "any"? diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index 0df6e2e..a7078c5 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -122,27 +122,29 @@ impl Parse for ParserRule { } impl Parse for VarBinding { - // ("$" syn::Ident ("<" syn::Type ">")? ":")? ParserExpr + // ParserExpr ("[" syn::Ident (":" syn::Type)? "]")? fn parse(input: ParseStream) -> syn::Result { + let expr = input.parse::()?; + let mut name = None; let mut ty = None; - if input.peek(Token![$]) { - input.parse::()?; - name = Some(input.parse::()?.unraw()); + if input.peek(syn::token::Bracket) { + let content; + bracketed!(content in input); + name = Some(content.parse::()?.unraw()); - if input.peek(syn::token::Bracket) { - let content; - bracketed!(content in input); + if content.peek(Token![:]) { + content.parse::()?; ty = Some(content.parse::()?); } - input.parse::()?; + if !content.is_empty() { + return Err(content.error("expected `]`")); + } } - let expr = input.parse::()?; - - Ok(Self { name, ty, expr }) + Ok(Self { expr, name, ty }) } } @@ -183,7 +185,7 @@ fn parse_lexer_expr(input: ParseStream, min_bp: u32) -> syn::Result { let rhs = parse_lexer_expr(input, r_bp)?; break 'lhs LexerExpr::Not(Box::new(rhs)); } - return Err(input.error("expect lexer expression")); + return Err(input.error("expected lexer expression")); }; loop { @@ -272,7 +274,7 @@ fn parse_parser_expr(input: ParseStream, min_bp: u32) -> syn::Result _ => return Err(syn::Error::new(ident.span(), "invalid ident")), } } - return Err(input.error("expect lexer expression")); + return Err(input.error("expected parser expression")); }; loop { @@ -336,18 +338,17 @@ mod test { fn test_full() { syn::parse_str::( r#" - %entry = sexpr; + %entry = sexp; - BLANK = " "; DIGIT = '0'..'9'; ALPHA = 'a'..'z' | 'A'..'Z'; LPAREN = "("; RPAREN = ")"; ATOM = ALPHA (ALPHA | DIGIT)*; - %skip = (BLANK | "\t" | "\n" | "\r")+; + %skip = (" " | "\t" | "\n" | "\r")+; - compound: SExp = LPAREN $sexp[Vec<_>]:sexp+ RPAREN { SExp::Compound(sexp) }; - atom : SExp = $atom:ATOM { SExp::Atom(atom) }; + compound: SExp = LPAREN sexp+[sexp:Vec<_>] RPAREN { SExp::Compound(sexp) }; + atom : SExp = ATOM[atom] { SExp::Atom(atom) }; sexp : SExp = compound | atom; "#, From 4994ecb588325a83b6d43021a68a2a15e9e99d1b Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 25 Jul 2023 15:38:42 -0400 Subject: [PATCH 11/42] add more design details of semact --- pag-parser2/src/nf/semact.rs | 123 +++++++++++++++++++++++++++-------- 1 file changed, 96 insertions(+), 27 deletions(-) diff --git a/pag-parser2/src/nf/semact.rs b/pag-parser2/src/nf/semact.rs index 15d5c42..bec6291 100644 --- a/pag-parser2/src/nf/semact.rs +++ b/pag-parser2/src/nf/semact.rs @@ -8,42 +8,111 @@ use std::collections::HashMap; -use quote::format_ident; -use syn::{parse_quote, Expr, ExprCall, Stmt, Type}; - use super::Tag; +use syn::{parse_quote, Expr, Type}; pub type SemActTable = HashMap; -pub struct SemAct { - /// Identifier of the semantic action routine. - function: Expr, - /// Type annotation - ty: Option, - /// Number of arguments - arity: usize, + +/// +/// ``` +/// trait Collector { +/// pub type Output; +/// fn finalize(self) -> Self::Output; +/// fn collect(&mut self, data: T); +/// } +/// +/// ``` +pub enum SemAct { + CustomizedRoutine { + /// Identifier of the semantic action routine. + function: Expr, + /// Type annotation + ret_type: Type, + /// Number of arguments + arity: usize, + }, + /// Specialized for the inner of @(@a, @b, @c). Return an Tuple of the inner routine. + Tuple, + /// Specialized for `inner?`. Return an Option of the inner routine + Option { inner_type: Type }, + /// Specialized for `i*` + /// Initialize a `Collector` (requires `Default + Collector`) and return the result from `Collector::finalize`. + ZeroOrMore { collector: Type }, + /// Specialized for `i+` = `i ~ i*`. + /// Initialize a `Collector` (requires `From + Collector`), pass it to the recursive routine + /// and return the result from `Collector::finalize`. + OneOrMoreToplevel { collector: Type }, + /// Specialized for `i+` = `i ~ i*`. + /// Accepts a `&mut Collector` + OneOrMoreNested { collector: Type }, } impl SemAct { - fn generate_call(&self) -> ExprCall { - let exprs = (0..self.arity).map(|i| format_ident!("__{}", i)); - let function = &self.function; - parse_quote!( - #function(#(#exprs),*) - ) - } - pub fn generate_statement(&self, output: Option) -> Stmt { - let expr = self.generate_call(); - match output { - None => parse_quote!( - #expr; - ), - Some(index) => { - let ty = self.ty.iter(); - let output = format_ident!("__{}", index); + /// Generate inlined expr for reduce action `terminal shift [reduce] shift shift` + pub fn generate_inline_expr<'a, I: IntoIterator>( + &self, + exprs: I, + delayed_func: Option, + ) -> Expr { + debug_assert_eq!( + delayed_func.is_some(), + matches!(self, Self::OneOrMoreToplevel { .. }) + ); + match self { + Self::CustomizedRoutine { + function, + ret_type: _, + arity: _, + } => { + let exprs = exprs.into_iter(); parse_quote!( - let #output #(: #ty)* = #expr; + #function(#(#exprs,)*) ) } + Self::Tuple => { + let exprs = exprs.into_iter(); + parse_quote!( + (#(#exprs,)*) + ) + } + Self::Option { .. } => { + unreachable!("Option can never be inlined, otherwise there is sequential ambiguity") + } + + Self::ZeroOrMore { .. } => unreachable!( + "ZeroOrMore can never be inlined, otherwise there is sequential ambiguity" + ), + + Self::OneOrMoreNested { .. } => unreachable!( + "OneOrMoreNested can never be inlined because it never appears in the first place" + ), + + Self::OneOrMoreToplevel { collector } => { + let exprs = exprs.into_iter(); + let delayed_func = delayed_func.unwrap(); + // TODO: src, offset + parse_quote! { + { + let mut collector = #collector::from(#(#exprs)*); + #delayed_func(&mut collector, src, offset); + collector.finalize() + } + } + } } } + + /// This function is useful in the following cases: + /// - If a shift routine is nested one or more, we does not emit the call to it immediately. Instead, we wait until + /// [`Self::generate_inlin_expr`] is called. + /// - If a parser routine has a semact [`Self::OneOrMoreNested`], it should be parametized by `C : Collector` in type + /// and its has `&mut C` as its first input param. + pub fn is_nested_one_or_more(&self) -> bool { + matches!(self, Self::OneOrMoreNested { .. }) + } + + /// Check if we should generate loops for TCO. + pub fn should_tco(&self) -> bool { + matches!(self, Self::ZeroOrMore { .. } | Self::OneOrMoreNested { .. }) + } } From a0608326046c4270f6438e52c512178f294ab179 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 25 Jul 2023 16:15:34 -0400 Subject: [PATCH 12/42] address comments on trait design --- pag-parser2/src/nf/semact.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pag-parser2/src/nf/semact.rs b/pag-parser2/src/nf/semact.rs index bec6291..e89c8c4 100644 --- a/pag-parser2/src/nf/semact.rs +++ b/pag-parser2/src/nf/semact.rs @@ -15,9 +15,7 @@ pub type SemActTable = HashMap; /// /// ``` -/// trait Collector { -/// pub type Output; -/// fn finalize(self) -> Self::Output; +/// trait Collector : Default { /// fn collect(&mut self, data: T); /// } /// @@ -104,7 +102,7 @@ impl SemAct { /// This function is useful in the following cases: /// - If a shift routine is nested one or more, we does not emit the call to it immediately. Instead, we wait until - /// [`Self::generate_inlin_expr`] is called. + /// [`Self::generate_inline_expr`] is called. /// - If a parser routine has a semact [`Self::OneOrMoreNested`], it should be parametized by `C : Collector` in type /// and its has `&mut C` as its first input param. pub fn is_nested_one_or_more(&self) -> bool { From b4706a21f241fb95ece9d6123087950bb8e6d782 Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Wed, 26 Jul 2023 19:28:29 +0800 Subject: [PATCH 13/42] record lexer idx --- pag-parser2/src/frontend/ast.rs | 7 ++++- pag-parser2/src/frontend/parse.rs | 44 ++++++++++++++++++++----------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index 1b40038..3215246 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -11,10 +11,15 @@ use std::collections::HashMap; pub struct Ast { pub entry: syn::Ident, pub skip: Option, - pub lexer_map: HashMap, + pub lexer_map: HashMap, pub parser_map: HashMap, } +pub struct LexerDef { + pub idx: u32, + pub expr: LexerExpr, +} + pub struct ParserDef { pub ty: syn::Type, pub rules: Vec, diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index a7078c5..fb4c924 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -10,7 +10,7 @@ use super::ast::*; use syn::ext::IdentExt; use syn::parse::{Parse, ParseStream}; -use syn::{bracketed, parenthesized, parse_quote, Token}; +use syn::{bracketed, parenthesized, parse_quote, Error, Result, Token}; use std::collections::HashMap; @@ -33,7 +33,7 @@ fn ident_kind(ident: &syn::Ident) -> IdentKind { } impl Parse for Ast { - fn parse(input: ParseStream) -> syn::Result { + fn parse(input: ParseStream) -> Result { let mut entry = None; let mut skip = None; let mut lexer_map = HashMap::new(); @@ -41,32 +41,46 @@ impl Parse for Ast { while !input.is_empty() { if input.peek(Token![%]) { - // parse keyword + // parse keywords input.parse::()?; let ident = input.parse::()?.unraw(); match ident.to_string().as_str() { "entry" => { + if entry.is_some() { + return Err(Error::new(ident.span(), "duplicate %entry definition")); + } input.parse::()?; entry = Some(input.parse::()?); } "skip" => { + if skip.is_some() { + return Err(Error::new(ident.span(), "duplicate %skip definition")); + } input.parse::()?; skip = Some(input.parse::()?); } - _ => return Err(syn::Error::new(ident.span(), "invalid keyword")), + _ => return Err(Error::new(ident.span(), "invalid keyword")), } } else { // parse lexer / parser definitions let ident = input.parse::()?.unraw(); match ident_kind(&ident) { IdentKind::LexerName => { + if lexer_map.contains_key(&ident) { + return Err(Error::new(ident.span(), "duplicate lexer definition")); + } input.parse::()?; - lexer_map.insert(ident, input.parse::()?); + let idx = lexer_map.len() as _; + let expr = input.parse::()?; + lexer_map.insert(ident, LexerDef { idx, expr }); } IdentKind::ParserName => { + if parser_map.contains_key(&ident) { + return Err(Error::new(ident.span(), "duplicate parser definition")); + } parser_map.insert(ident, input.parse::()?); } - _ => return Err(syn::Error::new(ident.span(), "invalid ident")), + _ => return Err(Error::new(ident.span(), "invalid ident")), } } input.parse::()?; @@ -83,7 +97,7 @@ impl Parse for Ast { impl Parse for ParserDef { // (":" syn::Type)? = (ParserRule)|+ - fn parse(input: ParseStream) -> syn::Result { + fn parse(input: ParseStream) -> Result { let ty = match input.parse::() { Ok(_) => input.parse::()?, Err(_) => parse_quote!(&'src str), @@ -106,7 +120,7 @@ impl Parse for ParserDef { impl Parse for ParserRule { // (VarBinding)+ syn::Block? - fn parse(input: ParseStream) -> syn::Result { + fn parse(input: ParseStream) -> Result { let mut vars = Vec::new(); while !input.peek(syn::token::Brace) && !input.peek(Token![|]) && !input.peek(Token![;]) { vars.push(input.parse::()?); @@ -123,7 +137,7 @@ impl Parse for ParserRule { impl Parse for VarBinding { // ParserExpr ("[" syn::Ident (":" syn::Type)? "]")? - fn parse(input: ParseStream) -> syn::Result { + fn parse(input: ParseStream) -> Result { let expr = input.parse::()?; let mut name = None; @@ -149,18 +163,18 @@ impl Parse for VarBinding { } impl Parse for LexerExpr { - fn parse(input: ParseStream) -> syn::Result { + fn parse(input: ParseStream) -> Result { parse_lexer_expr(input, 0) } } // pratt parsing -fn parse_lexer_expr(input: ParseStream, min_bp: u32) -> syn::Result { +fn parse_lexer_expr(input: ParseStream, min_bp: u32) -> Result { let mut lhs = 'lhs: { if input.peek(syn::Ident) { let ident = input.parse::()?.unraw(); if ident_kind(&ident) != IdentKind::LexerName { - return Err(syn::Error::new(ident.span(), "invalid ident")); + return Err(Error::new(ident.span(), "invalid ident")); } break 'lhs LexerExpr::Ref(ident); } @@ -258,20 +272,20 @@ fn parse_lexer_expr(input: ParseStream, min_bp: u32) -> syn::Result { } impl Parse for ParserExpr { - fn parse(input: ParseStream) -> syn::Result { + fn parse(input: ParseStream) -> Result { parse_parser_expr(input, 0) } } // pratt parsing -fn parse_parser_expr(input: ParseStream, min_bp: u32) -> syn::Result { +fn parse_parser_expr(input: ParseStream, min_bp: u32) -> Result { let mut lhs = 'lhs: { if input.peek(syn::Ident) { let ident = input.parse::()?.unraw(); match ident_kind(&ident) { IdentKind::LexerName => break 'lhs ParserExpr::LexerRef(ident), IdentKind::ParserName => break 'lhs ParserExpr::ParserRef(ident), - _ => return Err(syn::Error::new(ident.span(), "invalid ident")), + _ => return Err(Error::new(ident.span(), "invalid ident")), } } return Err(input.error("expected parser expression")); From f59d893492cf9386b9d98a15a5b7f6a1a3c16359 Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Wed, 26 Jul 2023 21:32:39 +0800 Subject: [PATCH 14/42] support ignore in parser expr --- pag-parser2/src/frontend/ast.rs | 1 + pag-parser2/src/frontend/parse.rs | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index 3215246..8de4969 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -58,4 +58,5 @@ pub enum ParserExpr { Opt(Box), LexerRef(syn::Ident), ParserRef(syn::Ident), + Ignore(Box), } diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index fb4c924..bf3bb53 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -288,11 +288,22 @@ fn parse_parser_expr(input: ParseStream, min_bp: u32) -> Result { _ => return Err(Error::new(ident.span(), "invalid ident")), } } + if input.peek(syn::token::Paren) { + let content; + parenthesized!(content in input); + break 'lhs content.parse::()?; + } + if input.peek(Token![#]) { + input.parse::()?; + let r_bp = 60; + let rhs = parse_parser_expr(input, r_bp)?; + break 'lhs ParserExpr::Ignore(Box::new(rhs)); + } return Err(input.error("expected parser expression")); }; loop { - if input.peek(syn::Ident) { + if input.peek(syn::Ident) || input.peek(Token![#]) { let (l_bp, r_bp) = (40, 41); if l_bp < min_bp { break; @@ -338,6 +349,11 @@ fn parse_parser_expr(input: ParseStream, min_bp: u32) -> Result { mod test { use super::*; + #[test] + fn test_var_binding() { + syn::parse_str::(r#"(#LPAREN expr #RPAREN)?[e]"#).unwrap(); + } + #[test] fn test_lexer_expr() { syn::parse_str::(r#"("abc" 'a'..'z') r#A | B & C | D* E+ F? !G"#).unwrap(); From 7faa80df5ccbe5de1adeeab240ae55818b43e5fa Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 25 Jul 2023 20:10:06 -0400 Subject: [PATCH 15/42] add some debug facilities --- pag-parser2/Cargo.toml | 5 ++ pag-parser2/src/debug.rs | 22 +++++++++ pag-parser2/src/lib.rs | 2 + pag-parser2/src/nf/mod.rs | 101 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 130 insertions(+) create mode 100644 pag-parser2/src/debug.rs diff --git a/pag-parser2/Cargo.toml b/pag-parser2/Cargo.toml index e9a6228..0513c4c 100644 --- a/pag-parser2/Cargo.toml +++ b/pag-parser2/Cargo.toml @@ -22,6 +22,11 @@ rust-version.workspace = true authors.workspace = true readme.workspace = true +[features] +ansi-debug = ["nu-ansi-term", "debug"] +debug = [] + [dependencies] syn = { version = "2.0.27", features = ["full"] } quote = "1.0.9" +nu-ansi-term = { version = "0.49.0", optional = true } \ No newline at end of file diff --git a/pag-parser2/src/debug.rs b/pag-parser2/src/debug.rs new file mode 100644 index 0000000..fc99fdb --- /dev/null +++ b/pag-parser2/src/debug.rs @@ -0,0 +1,22 @@ +#[cfg(feature = "ansi-debug")] +macro_rules! styled { + ($style:expr, $($arg:tt)*) => { + { + use nu_ansi_term::*; + $style.paint(format!($($arg)*)) + } + }; +} +#[cfg(not(feature = "ansi-debug"))] +macro_rules! styled { + ($style:expr, $($arg:tt)*) => {format!($($arg)*)}; +} + +macro_rules! styled_write { + ($dst:expr, $($arg:tt)*) => { + write!($dst, "{}", $crate::debug::styled!($($arg)*)) + }; +} + +pub(crate) use styled; +pub(crate) use styled_write; diff --git a/pag-parser2/src/lib.rs b/pag-parser2/src/lib.rs index 11e33c1..85c0851 100644 --- a/pag-parser2/src/lib.rs +++ b/pag-parser2/src/lib.rs @@ -6,5 +6,7 @@ // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. +#[cfg(feature = "debug")] +mod debug; mod frontend; mod nf; diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index 29d6c92..10aa16e 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -8,6 +8,9 @@ use quote::format_ident; use syn::Ident; + +#[cfg(feature = "debug")] +use crate::debug::styled_write; mod semact; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] @@ -32,6 +35,16 @@ impl Tag { } } +#[cfg(feature = "debug")] +impl std::fmt::Display for Tag { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Tag::Toplevel(ident) => write!(f, "{}", ident), + Tag::Anonymous(index) => write!(f, "{{{}}}", index), + } + } +} + /// Action in the normal form. /// If this subroutine's return value is taken, it should mark [`Action::output`] as `true`. /// There is no need to assign an ident to a subroutine. As we are always @@ -51,3 +64,91 @@ pub enum Action { output: bool, }, } + +#[cfg(feature = "debug")] +impl std::fmt::Display for Action { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Reduce { tag, output } => { + if *output { + styled_write!(f, Color::Red, "[{tag}]") + } else { + styled_write!(f, Color::Blue, "[{tag}]") + } + } + Self::Shift { tag, output } => { + if *output { + styled_write!(f, Color::Red, "{tag}") + } else { + styled_write!(f, Color::Blue, "{tag}") + } + } + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum NormalForm { + Empty(Vec<(Tag, bool)>), + Unexpanded(Vec), + Sequence(Ident, Vec), +} + +#[cfg(feature = "debug")] +impl std::fmt::Display for NormalForm { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Empty(actions) => { + write!(f, "ε")?; + for (tag, output) in actions.iter() { + if *output { + styled_write!(f, Color::Red, "[{tag}]")?; + } else { + styled_write!(f, Color::Blue, "[{tag}]")?; + } + } + } + Self::Unexpanded(actions) => { + write!(f, "{}", actions[0])?; + for action in &actions[1..] { + write!(f, " {}", action)?; + } + } + Self::Sequence(terminal, actions) => { + styled_write!(f, Color::Yellow.bold(), "{terminal}")?; + for action in actions.iter() { + write!(f, " {}", action)?; + } + } + } + Ok(()) + } +} + +#[cfg(all(feature = "debug", test))] +#[test] +fn debug_print_test() { + use quote::format_ident; + let sequence = NormalForm::Sequence( + format_ident!("TEST"), + vec![ + Action::Shift { + tag: Tag::Toplevel(format_ident!("a")), + output: false, + }, + Action::Reduce { + tag: Tag::Toplevel(format_ident!("b")), + output: true, + }, + Action::Shift { + tag: Tag::Toplevel(format_ident!("c")), + output: true, + }, + Action::Reduce { + tag: Tag::Anonymous(1), + output: false, + }, + ], + ); + println!("{}", sequence); +} From 550d0ee801958e511cd83d37e2f1a14be1623613 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 25 Jul 2023 21:16:32 -0400 Subject: [PATCH 16/42] print normal form table --- pag-parser2/Cargo.toml | 5 +- pag-parser2/src/nf/mod.rs | 108 ++++++++++++++++++++++++++++++++++---- 2 files changed, 100 insertions(+), 13 deletions(-) diff --git a/pag-parser2/Cargo.toml b/pag-parser2/Cargo.toml index 0513c4c..b6160ca 100644 --- a/pag-parser2/Cargo.toml +++ b/pag-parser2/Cargo.toml @@ -24,9 +24,10 @@ readme.workspace = true [features] ansi-debug = ["nu-ansi-term", "debug"] -debug = [] +debug = ["term_size"] [dependencies] syn = { version = "2.0.27", features = ["full"] } quote = "1.0.9" -nu-ansi-term = { version = "0.49.0", optional = true } \ No newline at end of file +nu-ansi-term = { version = "0.49.0", optional = true } +term_size = { version = "0.3", optional = true } \ No newline at end of file diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index 10aa16e..9f32362 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -6,9 +6,12 @@ // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. +use std::{collections::HashMap, ops::Deref}; + use quote::format_ident; use syn::Ident; +use crate::debug::styled; #[cfg(feature = "debug")] use crate::debug::styled_write; mod semact; @@ -39,8 +42,8 @@ impl Tag { impl std::fmt::Display for Tag { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Tag::Toplevel(ident) => write!(f, "{}", ident), - Tag::Anonymous(index) => write!(f, "{{{}}}", index), + Tag::Toplevel(ident) => write!(f, "{ident}"), + Tag::Anonymous(index) => styled_write!(f, Style::new().italic(), "_{index}"), } } } @@ -71,16 +74,16 @@ impl std::fmt::Display for Action { match self { Self::Reduce { tag, output } => { if *output { - styled_write!(f, Color::Red, "[{tag}]") + styled_write!(f, Color::Blue.underline(), "{tag}") } else { - styled_write!(f, Color::Blue, "[{tag}]") + styled_write!(f, Color::Blue, "{tag}") } } Self::Shift { tag, output } => { if *output { - styled_write!(f, Color::Red, "{tag}") + styled_write!(f, Color::Red.underline(), "{tag}") } else { - styled_write!(f, Color::Blue, "{tag}") + styled_write!(f, Color::Red, "{tag}") } } } @@ -102,22 +105,22 @@ impl std::fmt::Display for NormalForm { write!(f, "ε")?; for (tag, output) in actions.iter() { if *output { - styled_write!(f, Color::Red, "[{tag}]")?; + styled_write!(f, Color::Blue.underline(), "\t{tag}")?; } else { - styled_write!(f, Color::Blue, "[{tag}]")?; + styled_write!(f, Color::Blue, "\t{tag}")?; } } } Self::Unexpanded(actions) => { write!(f, "{}", actions[0])?; for action in &actions[1..] { - write!(f, " {}", action)?; + write!(f, "\t{}", action)?; } } Self::Sequence(terminal, actions) => { - styled_write!(f, Color::Yellow.bold(), "{terminal}")?; + styled_write!(f, Color::Yellow, "{terminal}")?; for action in actions.iter() { - write!(f, " {}", action)?; + write!(f, "\t{}", action)?; } } } @@ -152,3 +155,86 @@ fn debug_print_test() { ); println!("{}", sequence); } + +/// Well, it is not the notorius firewall. +pub struct NFTable(HashMap>); + +impl Deref for NFTable { + type Target = HashMap>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[cfg(feature = "debug")] +impl std::fmt::Display for NFTable { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let width = term_size::dimensions().map(|x| x.0).unwrap_or(0); + writeln!(f, "┏{}┓", "━".repeat(width.saturating_sub(2)))?; + writeln!( + f, + "\t{}\t{}\t{}\t{}\n", + styled!(Color::Red.bold(), "Shift"), + styled!(Color::Blue.bold(), "Reduce"), + styled!(Style::new().underline().bold(), "Output"), + styled!(Style::new().italic().bold(), "Anonymous"), + )?; + for (tag, forms) in self.iter() { + writeln!( + f, + "\t{}\t=\t{}", + styled!(Style::new().underline(), "{tag}"), + forms[0] + )?; + for form in &forms[1..] { + writeln!(f, "\t\t|\t{}", form)?; + } + writeln!(f)?; + } + writeln!(f, "┗{}┛", "━".repeat(width.saturating_sub(2))) + } +} + +#[cfg(all(feature = "debug", test))] +#[test] +fn debug_print_nf_table() { + use quote::format_ident; + let sequence = NormalForm::Sequence( + format_ident!("TEST"), + vec![ + Action::Shift { + tag: Tag::Toplevel(format_ident!("a")), + output: false, + }, + Action::Reduce { + tag: Tag::Toplevel(format_ident!("b")), + output: true, + }, + Action::Shift { + tag: Tag::Toplevel(format_ident!("c")), + output: true, + }, + Action::Reduce { + tag: Tag::Anonymous(1), + output: false, + }, + ], + ); + let empty = NormalForm::Empty(vec![ + (Tag::Toplevel(format_ident!("a")), false), + (Tag::Toplevel(format_ident!("b")), true), + ]); + let table = NFTable( + vec![ + ( + Tag::Toplevel(format_ident!("TEST1")), + vec![sequence.clone(), empty.clone()], + ), + (Tag::Toplevel(format_ident!("TEST2")), vec![sequence, empty]), + ] + .into_iter() + .collect(), + ); + println!("{}", table); +} From 8b81928b7be5c19250619832bfc0928d2a94988a Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Wed, 26 Jul 2023 21:53:16 +0800 Subject: [PATCH 17/42] fix use error --- pag-parser2/src/nf/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index 9f32362..3f04e7f 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -6,15 +6,15 @@ // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. +mod semact; + use std::{collections::HashMap, ops::Deref}; use quote::format_ident; use syn::Ident; -use crate::debug::styled; #[cfg(feature = "debug")] -use crate::debug::styled_write; -mod semact; +use crate::debug::{styled, styled_write}; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Tag { From ce21ff6d9e4d3da4e980163bf7f4f0bbf754a2fc Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Wed, 26 Jul 2023 22:09:28 +0800 Subject: [PATCH 18/42] fix parser bug --- pag-parser2/src/frontend/parse.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index bf3bb53..093842c 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -217,7 +217,6 @@ fn parse_lexer_expr(input: ParseStream, min_bp: u32) -> Result { || input.peek(syn::LitStr) || input.peek(syn::LitChar) || input.peek(syn::token::Paren) - || input.peek(syn::token::Paren) || input.peek(Token![!]) { let (l_bp, r_bp) = (40, 41); @@ -303,7 +302,7 @@ fn parse_parser_expr(input: ParseStream, min_bp: u32) -> Result { }; loop { - if input.peek(syn::Ident) || input.peek(Token![#]) { + if input.peek(syn::Ident) || input.peek(syn::token::Paren) || input.peek(Token![#]) { let (l_bp, r_bp) = (40, 41); if l_bp < min_bp { break; @@ -351,7 +350,7 @@ mod test { #[test] fn test_var_binding() { - syn::parse_str::(r#"(#LPAREN expr #RPAREN)?[e]"#).unwrap(); + syn::parse_str::(r#"(ident (#COLON expr)?)*[e]"#).unwrap(); } #[test] From 58ecd55a238a75aefc64528045dd98554ceb575e Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Wed, 26 Jul 2023 16:52:17 -0400 Subject: [PATCH 19/42] add `HKT` support and type inference prototype --- pag-parser2/src/frontend/ast.rs | 9 +- pag-parser2/src/frontend/parse.rs | 24 +++- pag-parser2/src/nf/inference.rs | 214 ++++++++++++++++++++++++++++++ pag-parser2/src/nf/mod.rs | 77 ++++++++--- pag-parser2/src/nf/semact.rs | 102 +++----------- 5 files changed, 313 insertions(+), 113 deletions(-) create mode 100644 pag-parser2/src/nf/inference.rs diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index 8de4969..0290189 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -21,7 +21,7 @@ pub struct LexerDef { } pub struct ParserDef { - pub ty: syn::Type, + pub ty: TypeAnnotation, pub rules: Vec, } @@ -29,11 +29,16 @@ pub struct ParserRule { pub vars: Vec, pub action: Option, } +#[derive(Clone)] +pub enum TypeAnnotation { + Concrete(syn::Type), + HigherKind(syn::Path), +} pub struct VarBinding { pub expr: ParserExpr, pub name: Option, - pub ty: Option, + pub ty: Option, } // TODO: how to express "bottom" & "any"? diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index 093842c..43570f2 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -99,7 +99,7 @@ impl Parse for ParserDef { // (":" syn::Type)? = (ParserRule)|+ fn parse(input: ParseStream) -> Result { let ty = match input.parse::() { - Ok(_) => input.parse::()?, + Ok(_) => input.parse::()?, Err(_) => parse_quote!(&'src str), }; @@ -135,6 +135,18 @@ impl Parse for ParserRule { } } +impl Parse for TypeAnnotation { + fn parse(input: ParseStream) -> Result { + if input.peek(Token![@]) { + input.parse::()?; + let path = input.parse::()?; + Ok(Self::HigherKind(path)) + } else { + Ok(Self::Concrete(input.parse::()?)) + } + } +} + impl Parse for VarBinding { // ParserExpr ("[" syn::Ident (":" syn::Type)? "]")? fn parse(input: ParseStream) -> Result { @@ -150,7 +162,7 @@ impl Parse for VarBinding { if content.peek(Token![:]) { content.parse::()?; - ty = Some(content.parse::()?); + ty = Some(content.parse::()?); } if !content.is_empty() { @@ -363,6 +375,12 @@ mod test { syn::parse_str::(r#"A? b c* D+ F?"#).unwrap(); } + #[test] + fn test_parser_type_annotatopn() { + syn::parse_str::(r#"@Vec"#).unwrap(); + syn::parse_str::(r#"Vec"#).unwrap(); + } + #[test] fn test_full() { syn::parse_str::( @@ -376,7 +394,7 @@ mod test { ATOM = ALPHA (ALPHA | DIGIT)*; %skip = (" " | "\t" | "\n" | "\r")+; - compound: SExp = LPAREN sexp+[sexp:Vec<_>] RPAREN { SExp::Compound(sexp) }; + compound: SExp = LPAREN sexp+[sexp:@Vec] RPAREN { SExp::Compound(sexp) }; atom : SExp = ATOM[atom] { SExp::Atom(atom) }; sexp : SExp = compound | atom; diff --git a/pag-parser2/src/nf/inference.rs b/pag-parser2/src/nf/inference.rs new file mode 100644 index 0000000..b1f51e8 --- /dev/null +++ b/pag-parser2/src/nf/inference.rs @@ -0,0 +1,214 @@ +// If there is no semantic action, the routine is plain scan over. Thus, the type is unit. +// ⊢ x = ..., SemAct[x] = ∅ +// ------------------- +// ⊢ x : () + +// A Customized Routine must have type annotation +// ⊢ x = ..., SemAct[x] = Customized(𝜏) +// ------------------- +// ⊢ x : 𝜏 + +// A Token action gives the span of a terminal +// ⊢ x = T, SemAct[x] = Token +// ------------------- +// ⊢ x : Span + +// Fully normalized Option must be in the following form: +// x = T_0 ... [r_0] | T_1 ... [r_1] | .. | ε +// +// Thus, the rule should be: +// +// Γ ⊢ x = T_0 ... [r_0] | T_1 ... [r_1] | .. | ε +// Γ ⊢ r_0 : 𝜏_0, r_1 : 𝜏_1 ... +// Γ ⊢ 𝜏 = 𝜏_0 = 𝜏_1 = ... +// SemAct[x] = Option +// ------------------- +// Γ ⊢ x : Option<𝜏> + +// Fully normalized ZeroOrMore must be in the following form: +// x = T_0 ... [r_0] | T_1 ... [r_1] | .. | ε +// +// Thus, the rule should be: +// +// Γ ⊢ x = T_0 ... [r_0] | T_1 ... [r_1] | .. | ε +// Γ ⊢ r_0 : 𝜏_0, r_1 : 𝜏_1 ... +// Γ ⊢ 𝜏 = 𝜏_0 = 𝜏_1 =... +// SemAct[x] = ZeroOrMore(Σ ∈ Collector<𝜏>) +// ------------------- +// Γ ⊢ x : Σ + +// Fully normalized OneOrMoreToplevel must be in the following form: +// x = T_0 ...[r_0] t | T_1 ... [r_1] t | .. +// +// Thus, the rule should be: +// +// Γ ⊢ x = T_0 ...[r_0] t | T_1 ... [r_1] t | .. +// Γ ⊢ r_0 : 𝜏_0, r_1 : 𝜏_1 ... +// Γ ⊢ 𝜏 = 𝜏_0 = 𝜏_1 = ... +// SemAct[x] = OneOrMoreToplevel +// ------------------- +// Γ ⊢ x : Σ + +// Fully normalized OneOrMoreNested must be in the following form: +// x = T_0 ... [r_0] | T_1 ... [r_1] | .. | ε +// +// Thus, the rule should be: +// +// Γ ⊢ x = T_0 ... [r_0] | T_1 ... [r_1] | .. | ε +// Γ ⊢ r_0 : 𝜏_0, r_1 : 𝜏_1 ... +// Γ ⊢ 𝜏 = 𝜏_0 = 𝜏_1 =... +// SemAct[x] = ZeroOrMore +// ------------------- +// Γ ⊢ x : () -- Notice that x accept &mut C ∈ Collector<𝜏> instead + +// Fully normalized Tuple must be in the following form: +// x = T_0 ... [r_0] x00 _x01 x02 | .. +// let η_i be the type tuple of everything including last reduce that gives an output. +// x = T_0 ... [r_0] x00 _x01 x02 | .. +// Γ ⊢ ║ η_0 ║ = ║ η_1 ║ = ... +// Γ ⊢ ∀i.∀j.∀k. η_i.k = η_j.k +// SemAct[x] = Gather +// ------------------- +// Γ ⊢ x : η + +use std::{ + cell::UnsafeCell, + collections::{hash_map::Entry, HashMap}, +}; + +use syn::{parse_quote, Type}; + +use crate::{frontend::TypeAnnotation}; + +use super::{ + semact::{SemAct, SemActTable}, + NormalForm, Tag, +}; + +pub struct InferenceContext<'a> { + /// Typed tags + gamma: UnsafeCell>, + /// Type annotations from user + annotations: &'a HashMap, + /// Semantic action table + semact: &'a SemActTable, + /// Fully normalized terms + nforms: &'a HashMap>, +} +impl<'a> InferenceContext<'a> { + /// Create a new inference context + pub fn new( + annotations: &'a HashMap, + semact: &'a SemActTable, + nforms: &'a HashMap>, + ) -> Self { + Self { + gamma: UnsafeCell::new(HashMap::new()), + annotations, + semact, + nforms, + } + } + fn infer_gather<'i, I: Iterator>(&self, mut tags: I) -> Option { + if let Some(tag) = tags.next() { + let mut types = vec![self.infer(tag)?]; + for t in tags { + // If any inference fails, the whole inference fails + let ty = self.infer(t)?; + types.push(ty); + } + if types.len() == 1 { + // If there is only one field, no need to wrap in a tuple + Some(types.pop().unwrap()) + } else { + // Otherwise, wrap in a tuple + Some(parse_quote!((#(#types),*))) + } + } else { + // no field, unit type + Some(parse_quote!(())) + } + } + fn infer(&self, tag: &Tag) -> Option { + match unsafe { (*self.gamma.get()).entry(tag.clone()) } { + // If a tag has been inferred, return its type directly + Entry::Occupied(entry) => Some(entry.get().clone()), + Entry::Vacant(slot) => Some( + slot.insert({ + // If a concrete type annotation is provided, use it directly + if let Some(x) = self.annotations.get(tag).and_then(|anno| match anno { + TypeAnnotation::Concrete(ty) => Some(ty.clone()), + _ => None, + }) { + x + } else { + let semact = self.semact.get(tag); + match semact { + // No semantic action, the type is unit + None => parse_quote!(()), + // Token semantic action, the type is Span + Some(SemAct::Token) => parse_quote!(::pag_runtime::Span<'src>), + // Customized routine without type annotation -- inference failed + Some(SemAct::CustomizedRoutine(..)) => return None, + // Nested routine for one or more, the type is unit. + Some(SemAct::OneOrMoreNested) => parse_quote!(()), + Some(SemAct::Gather) => { + let nfs = self.nforms.get(tag)?; + let mut inferred = None; + // find first subexpression that fulfills inference + for i in nfs.iter() { + let visible = i.visible_bindings(0); + if let Some(gather_type) = + self.infer_gather(visible.into_iter().map(|x| x.1)) + { + inferred.replace(gather_type); + break; + } + } + inferred? + } + Some(SemAct::ZeroOrMore) | Some(SemAct::Option) | Some(SemAct::OneOrMoreToplevel) => { + let nfs = self.nforms.get(tag)?; + let TypeAnnotation::HigherKind(path) = self + .annotations.get(tag).cloned().unwrap_or_else(|| + if matches!(semact, Some(SemAct::Option)) { + TypeAnnotation::HigherKind(parse_quote!(::std::option::Option)) + } else { + TypeAnnotation::HigherKind(parse_quote!(::std::collections::VecDeque)) }) + else { unreachable!("must be higher kind type") }; + let mut inferred = None; + // find first subexpression that fulfills inference + for i in nfs.iter() { + // Skip epsilon production, this is safe since OneOrMoreToplevel will never be empty + if let NormalForm::Empty(x) = i { + if x.is_empty() { + continue; + } + } + // skip the trailing part of OneOrMoreToplevel + let visible = i.visible_bindings( + if matches!(semact, Some(SemAct::OneOrMoreToplevel)) { + 1 + } else { + 0 + }, + ); + if let Some(gather_type) = + self.infer_gather(visible.into_iter().map(|x| x.1)) + { + inferred.replace( + parse_quote!(#path<#gather_type>), + ); + break; + } + } + inferred? + } + } + } + }) + .clone(), + ), + } + } +} diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index 3f04e7f..214d2fb 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -6,9 +6,13 @@ // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. +mod inference; mod semact; -use std::{collections::HashMap, ops::Deref}; +use std::{ + collections::{HashMap, VecDeque}, + ops::{Deref}, +}; use quote::format_ident; use syn::Ident; @@ -59,12 +63,12 @@ pub enum Action { Shift { /// Parser routine to call. tag: Tag, - output: bool, + output: Option, }, Reduce { /// Reduction routine to call. tag: Tag, - output: bool, + output: Option, }, } @@ -73,15 +77,15 @@ impl std::fmt::Display for Action { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Reduce { tag, output } => { - if *output { - styled_write!(f, Color::Blue.underline(), "{tag}") + if let Some(name) = output { + styled_write!(f, Color::Blue, "{tag}[{name}]") } else { styled_write!(f, Color::Blue, "{tag}") } } Self::Shift { tag, output } => { - if *output { - styled_write!(f, Color::Red.underline(), "{tag}") + if let Some(name) = output { + styled_write!(f, Color::Red, "{tag}[{name}]") } else { styled_write!(f, Color::Red, "{tag}") } @@ -92,11 +96,42 @@ impl std::fmt::Display for Action { #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum NormalForm { - Empty(Vec<(Tag, bool)>), + Empty(Vec<(Tag, Option)>), Unexpanded(Vec), Sequence(Ident, Vec), } +impl NormalForm { + pub fn visible_bindings(&self, skip: usize) -> Vec<(&Ident, &Tag)> { + match self { + Self::Empty(actions) => actions + .last() + .and_then(|(tag, ident)| Some((ident.as_ref()?, tag))) + .into_iter() + .collect(), + Self::Unexpanded(actions) | Self::Sequence(_, actions) => { + let mut acc = VecDeque::new(); + for act in actions.iter().rev().skip(skip) { + match act { + Action::Shift { tag, output } => { + if let Some(ident) = output { + acc.push_front((ident, tag)); + } + } + Action::Reduce { tag, output } => { + if let Some(ident) = output { + acc.push_front((ident, tag)); + } + break; + } + } + } + acc.into_iter().collect() + } + } + } +} + #[cfg(feature = "debug")] impl std::fmt::Display for NormalForm { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -104,8 +139,8 @@ impl std::fmt::Display for NormalForm { Self::Empty(actions) => { write!(f, "ε")?; for (tag, output) in actions.iter() { - if *output { - styled_write!(f, Color::Blue.underline(), "\t{tag}")?; + if let Some(name) = output { + styled_write!(f, Color::Blue, "\t{tag}[{name}]")?; } else { styled_write!(f, Color::Blue, "\t{tag}")?; } @@ -137,19 +172,19 @@ fn debug_print_test() { vec![ Action::Shift { tag: Tag::Toplevel(format_ident!("a")), - output: false, + output: None, }, Action::Reduce { tag: Tag::Toplevel(format_ident!("b")), - output: true, + output: Some(format_ident!("x")), }, Action::Shift { tag: Tag::Toplevel(format_ident!("c")), - output: true, + output: Some(format_ident!("y")), }, Action::Reduce { tag: Tag::Anonymous(1), - output: false, + output: None, }, ], ); @@ -177,7 +212,7 @@ impl std::fmt::Display for NFTable { "\t{}\t{}\t{}\t{}\n", styled!(Color::Red.bold(), "Shift"), styled!(Color::Blue.bold(), "Reduce"), - styled!(Style::new().underline().bold(), "Output"), + styled!(Style::new().bold(), "[Output]"), styled!(Style::new().italic().bold(), "Anonymous"), )?; for (tag, forms) in self.iter() { @@ -205,25 +240,25 @@ fn debug_print_nf_table() { vec![ Action::Shift { tag: Tag::Toplevel(format_ident!("a")), - output: false, + output: None, }, Action::Reduce { tag: Tag::Toplevel(format_ident!("b")), - output: true, + output: Some(format_ident!("x")), }, Action::Shift { tag: Tag::Toplevel(format_ident!("c")), - output: true, + output: Some(format_ident!("y")), }, Action::Reduce { tag: Tag::Anonymous(1), - output: false, + output: None, }, ], ); let empty = NormalForm::Empty(vec![ - (Tag::Toplevel(format_ident!("a")), false), - (Tag::Toplevel(format_ident!("b")), true), + (Tag::Toplevel(format_ident!("a")), None), + (Tag::Toplevel(format_ident!("b")), Some(format_ident!("x"))), ]); let table = NFTable( vec![ diff --git a/pag-parser2/src/nf/semact.rs b/pag-parser2/src/nf/semact.rs index e89c8c4..0e137ad 100644 --- a/pag-parser2/src/nf/semact.rs +++ b/pag-parser2/src/nf/semact.rs @@ -9,7 +9,7 @@ use std::collections::HashMap; use super::Tag; -use syn::{parse_quote, Expr, Type}; + pub type SemActTable = HashMap; @@ -20,97 +20,25 @@ pub type SemActTable = HashMap; /// } /// /// ``` + +// those normal form without SemAct will be treated as plain scanner. pub enum SemAct { - CustomizedRoutine { - /// Identifier of the semantic action routine. - function: Expr, - /// Type annotation - ret_type: Type, - /// Number of arguments - arity: usize, - }, - /// Specialized for the inner of @(@a, @b, @c). Return an Tuple of the inner routine. - Tuple, + CustomizedRoutine(syn::Block), + /// Gather inner data. If multiple is selected, return a tuple. + /// If only one is selected, return target data. + Gather, /// Specialized for `inner?`. Return an Option of the inner routine - Option { inner_type: Type }, + Option, /// Specialized for `i*` - /// Initialize a `Collector` (requires `Default + Collector`) and return the result from `Collector::finalize`. - ZeroOrMore { collector: Type }, + /// Initialize a `Collector` (requires `Collector`) and return the result from `Collector::finalize`. + ZeroOrMore, /// Specialized for `i+` = `i ~ i*`. - /// Initialize a `Collector` (requires `From + Collector`), pass it to the recursive routine + /// Initialize a `Collector` (requires `Collector`), pass it to the recursive routine /// and return the result from `Collector::finalize`. - OneOrMoreToplevel { collector: Type }, + OneOrMoreToplevel, /// Specialized for `i+` = `i ~ i*`. /// Accepts a `&mut Collector` - OneOrMoreNested { collector: Type }, -} - -impl SemAct { - /// Generate inlined expr for reduce action `terminal shift [reduce] shift shift` - pub fn generate_inline_expr<'a, I: IntoIterator>( - &self, - exprs: I, - delayed_func: Option, - ) -> Expr { - debug_assert_eq!( - delayed_func.is_some(), - matches!(self, Self::OneOrMoreToplevel { .. }) - ); - match self { - Self::CustomizedRoutine { - function, - ret_type: _, - arity: _, - } => { - let exprs = exprs.into_iter(); - parse_quote!( - #function(#(#exprs,)*) - ) - } - Self::Tuple => { - let exprs = exprs.into_iter(); - parse_quote!( - (#(#exprs,)*) - ) - } - Self::Option { .. } => { - unreachable!("Option can never be inlined, otherwise there is sequential ambiguity") - } - - Self::ZeroOrMore { .. } => unreachable!( - "ZeroOrMore can never be inlined, otherwise there is sequential ambiguity" - ), - - Self::OneOrMoreNested { .. } => unreachable!( - "OneOrMoreNested can never be inlined because it never appears in the first place" - ), - - Self::OneOrMoreToplevel { collector } => { - let exprs = exprs.into_iter(); - let delayed_func = delayed_func.unwrap(); - // TODO: src, offset - parse_quote! { - { - let mut collector = #collector::from(#(#exprs)*); - #delayed_func(&mut collector, src, offset); - collector.finalize() - } - } - } - } - } - - /// This function is useful in the following cases: - /// - If a shift routine is nested one or more, we does not emit the call to it immediately. Instead, we wait until - /// [`Self::generate_inline_expr`] is called. - /// - If a parser routine has a semact [`Self::OneOrMoreNested`], it should be parametized by `C : Collector` in type - /// and its has `&mut C` as its first input param. - pub fn is_nested_one_or_more(&self) -> bool { - matches!(self, Self::OneOrMoreNested { .. }) - } - - /// Check if we should generate loops for TCO. - pub fn should_tco(&self) -> bool { - matches!(self, Self::ZeroOrMore { .. } | Self::OneOrMoreNested { .. }) - } + OneOrMoreNested, + /// Yield a token span, + Token, } From 203739ef0cda248f3367a0764f83c87e6fdc9949 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Thu, 27 Jul 2023 01:00:17 -0400 Subject: [PATCH 20/42] expose infer_all_type interface --- pag-parser2/src/nf/inference.rs | 15 ++++++++++++++- pag-parser2/src/nf/mod.rs | 2 +- pag-parser2/src/nf/semact.rs | 1 - 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pag-parser2/src/nf/inference.rs b/pag-parser2/src/nf/inference.rs index b1f51e8..98b7b0c 100644 --- a/pag-parser2/src/nf/inference.rs +++ b/pag-parser2/src/nf/inference.rs @@ -78,7 +78,7 @@ use std::{ use syn::{parse_quote, Type}; -use crate::{frontend::TypeAnnotation}; +use crate::frontend::TypeAnnotation; use super::{ semact::{SemAct, SemActTable}, @@ -129,6 +129,19 @@ impl<'a> InferenceContext<'a> { Some(parse_quote!(())) } } + /// try infer all types, but may fail with incomplete type information. + pub fn infer_all_types(mut self) -> HashMap { + let mut typed = 0; + while typed < self.nforms.len() { + typed = 0; + for i in self.nforms.keys() { + if self.infer(i).is_some() { + typed += 1; + } + } + } + std::mem::take(self.gamma.get_mut()) + } fn infer(&self, tag: &Tag) -> Option { match unsafe { (*self.gamma.get()).entry(tag.clone()) } { // If a tag has been inferred, return its type directly diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index 214d2fb..c26aa3f 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -11,7 +11,7 @@ mod semact; use std::{ collections::{HashMap, VecDeque}, - ops::{Deref}, + ops::Deref, }; use quote::format_ident; diff --git a/pag-parser2/src/nf/semact.rs b/pag-parser2/src/nf/semact.rs index 0e137ad..0dfd64b 100644 --- a/pag-parser2/src/nf/semact.rs +++ b/pag-parser2/src/nf/semact.rs @@ -10,7 +10,6 @@ use std::collections::HashMap; use super::Tag; - pub type SemActTable = HashMap; /// From d2d45b85cc28938fbdb7310fefa50cdd497af2f3 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Thu, 27 Jul 2023 03:04:35 -0400 Subject: [PATCH 21/42] address QC's method --- pag-parser2/src/frontend/ast.rs | 9 +---- pag-parser2/src/frontend/parse.rs | 24 ++--------- pag-parser2/src/nf/inference.rs | 67 +++++++++++++++++-------------- pag-parser2/src/nf/mod.rs | 2 +- 4 files changed, 43 insertions(+), 59 deletions(-) diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index 0290189..8de4969 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -21,7 +21,7 @@ pub struct LexerDef { } pub struct ParserDef { - pub ty: TypeAnnotation, + pub ty: syn::Type, pub rules: Vec, } @@ -29,16 +29,11 @@ pub struct ParserRule { pub vars: Vec, pub action: Option, } -#[derive(Clone)] -pub enum TypeAnnotation { - Concrete(syn::Type), - HigherKind(syn::Path), -} pub struct VarBinding { pub expr: ParserExpr, pub name: Option, - pub ty: Option, + pub ty: Option, } // TODO: how to express "bottom" & "any"? diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index 43570f2..093842c 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -99,7 +99,7 @@ impl Parse for ParserDef { // (":" syn::Type)? = (ParserRule)|+ fn parse(input: ParseStream) -> Result { let ty = match input.parse::() { - Ok(_) => input.parse::()?, + Ok(_) => input.parse::()?, Err(_) => parse_quote!(&'src str), }; @@ -135,18 +135,6 @@ impl Parse for ParserRule { } } -impl Parse for TypeAnnotation { - fn parse(input: ParseStream) -> Result { - if input.peek(Token![@]) { - input.parse::()?; - let path = input.parse::()?; - Ok(Self::HigherKind(path)) - } else { - Ok(Self::Concrete(input.parse::()?)) - } - } -} - impl Parse for VarBinding { // ParserExpr ("[" syn::Ident (":" syn::Type)? "]")? fn parse(input: ParseStream) -> Result { @@ -162,7 +150,7 @@ impl Parse for VarBinding { if content.peek(Token![:]) { content.parse::()?; - ty = Some(content.parse::()?); + ty = Some(content.parse::()?); } if !content.is_empty() { @@ -375,12 +363,6 @@ mod test { syn::parse_str::(r#"A? b c* D+ F?"#).unwrap(); } - #[test] - fn test_parser_type_annotatopn() { - syn::parse_str::(r#"@Vec"#).unwrap(); - syn::parse_str::(r#"Vec"#).unwrap(); - } - #[test] fn test_full() { syn::parse_str::( @@ -394,7 +376,7 @@ mod test { ATOM = ALPHA (ALPHA | DIGIT)*; %skip = (" " | "\t" | "\n" | "\r")+; - compound: SExp = LPAREN sexp+[sexp:@Vec] RPAREN { SExp::Compound(sexp) }; + compound: SExp = LPAREN sexp+[sexp:Vec<_>] RPAREN { SExp::Compound(sexp) }; atom : SExp = ATOM[atom] { SExp::Atom(atom) }; sexp : SExp = compound | atom; diff --git a/pag-parser2/src/nf/inference.rs b/pag-parser2/src/nf/inference.rs index 98b7b0c..75dd0a3 100644 --- a/pag-parser2/src/nf/inference.rs +++ b/pag-parser2/src/nf/inference.rs @@ -78,18 +78,24 @@ use std::{ use syn::{parse_quote, Type}; -use crate::frontend::TypeAnnotation; - use super::{ semact::{SemAct, SemActTable}, NormalForm, Tag, }; +#[derive(Clone)] +pub enum InferredType { + Concrete(Type), + Collector(Box), + Option(Box), + Tuple(Vec), +} + pub struct InferenceContext<'a> { /// Typed tags - gamma: UnsafeCell>, - /// Type annotations from user - annotations: &'a HashMap, + gamma: UnsafeCell>, + /// Type annotations from user (for toplevel) + annotations: &'a HashMap, /// Semantic action table semact: &'a SemActTable, /// Fully normalized terms @@ -98,7 +104,7 @@ pub struct InferenceContext<'a> { impl<'a> InferenceContext<'a> { /// Create a new inference context pub fn new( - annotations: &'a HashMap, + annotations: &'a HashMap, semact: &'a SemActTable, nforms: &'a HashMap>, ) -> Self { @@ -109,7 +115,7 @@ impl<'a> InferenceContext<'a> { nforms, } } - fn infer_gather<'i, I: Iterator>(&self, mut tags: I) -> Option { + fn infer_gather<'i, I: Iterator>(&self, mut tags: I) -> Option { if let Some(tag) = tags.next() { let mut types = vec![self.infer(tag)?]; for t in tags { @@ -122,15 +128,15 @@ impl<'a> InferenceContext<'a> { Some(types.pop().unwrap()) } else { // Otherwise, wrap in a tuple - Some(parse_quote!((#(#types),*))) + Some(InferredType::Tuple(types)) } } else { // no field, unit type - Some(parse_quote!(())) + Some(InferredType::Concrete(parse_quote! {()})) } } /// try infer all types, but may fail with incomplete type information. - pub fn infer_all_types(mut self) -> HashMap { + pub fn infer_all_types(mut self) -> HashMap { let mut typed = 0; while typed < self.nforms.len() { typed = 0; @@ -142,29 +148,30 @@ impl<'a> InferenceContext<'a> { } std::mem::take(self.gamma.get_mut()) } - fn infer(&self, tag: &Tag) -> Option { + fn infer(&self, tag: &Tag) -> Option { match unsafe { (*self.gamma.get()).entry(tag.clone()) } { // If a tag has been inferred, return its type directly Entry::Occupied(entry) => Some(entry.get().clone()), Entry::Vacant(slot) => Some( slot.insert({ // If a concrete type annotation is provided, use it directly - if let Some(x) = self.annotations.get(tag).and_then(|anno| match anno { - TypeAnnotation::Concrete(ty) => Some(ty.clone()), - _ => None, - }) { - x + if let Some(x) = self.annotations.get(tag) { + InferredType::Concrete(x.clone()) } else { let semact = self.semact.get(tag); match semact { // No semantic action, the type is unit - None => parse_quote!(()), + None => InferredType::Concrete(parse_quote!(())), // Token semantic action, the type is Span - Some(SemAct::Token) => parse_quote!(::pag_runtime::Span<'src>), + Some(SemAct::Token) => { + InferredType::Concrete(parse_quote!(::pag_runtime::Span<'src>)) + } // Customized routine without type annotation -- inference failed Some(SemAct::CustomizedRoutine(..)) => return None, // Nested routine for one or more, the type is unit. - Some(SemAct::OneOrMoreNested) => parse_quote!(()), + Some(SemAct::OneOrMoreNested) => { + InferredType::Concrete(parse_quote!(())) + } Some(SemAct::Gather) => { let nfs = self.nforms.get(tag)?; let mut inferred = None; @@ -180,15 +187,17 @@ impl<'a> InferenceContext<'a> { } inferred? } - Some(SemAct::ZeroOrMore) | Some(SemAct::Option) | Some(SemAct::OneOrMoreToplevel) => { + Some(SemAct::ZeroOrMore) + | Some(SemAct::Option) + | Some(SemAct::OneOrMoreToplevel) => { let nfs = self.nforms.get(tag)?; - let TypeAnnotation::HigherKind(path) = self - .annotations.get(tag).cloned().unwrap_or_else(|| - if matches!(semact, Some(SemAct::Option)) { - TypeAnnotation::HigherKind(parse_quote!(::std::option::Option)) - } else { - TypeAnnotation::HigherKind(parse_quote!(::std::collections::VecDeque)) }) - else { unreachable!("must be higher kind type") }; + let mapper = |ty: InferredType| { + if matches!(semact, Some(SemAct::Option)) { + InferredType::Option(Box::new(ty.clone())) + } else { + InferredType::Collector(Box::new(ty.clone())) + } + }; let mut inferred = None; // find first subexpression that fulfills inference for i in nfs.iter() { @@ -209,9 +218,7 @@ impl<'a> InferenceContext<'a> { if let Some(gather_type) = self.infer_gather(visible.into_iter().map(|x| x.1)) { - inferred.replace( - parse_quote!(#path<#gather_type>), - ); + inferred.replace(mapper(gather_type)); break; } } diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index c26aa3f..ea57f14 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -11,7 +11,7 @@ mod semact; use std::{ collections::{HashMap, VecDeque}, - ops::Deref, + ops::{ControlFlow, Deref}, }; use quote::format_ident; From e670899ec84889f2085d3a5a9714d0dd6191e476 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Thu, 27 Jul 2023 09:28:43 -0400 Subject: [PATCH 22/42] never coding again in the midnight --- pag-parser2/src/nf/inference.rs | 163 ++++++++++++++++---------------- pag-parser2/src/nf/mod.rs | 2 +- 2 files changed, 81 insertions(+), 84 deletions(-) diff --git a/pag-parser2/src/nf/inference.rs b/pag-parser2/src/nf/inference.rs index 75dd0a3..7838780 100644 --- a/pag-parser2/src/nf/inference.rs +++ b/pag-parser2/src/nf/inference.rs @@ -71,10 +71,7 @@ // ------------------- // Γ ⊢ x : η -use std::{ - cell::UnsafeCell, - collections::{hash_map::Entry, HashMap}, -}; +use std::collections::{HashMap}; use syn::{parse_quote, Type}; @@ -93,7 +90,7 @@ pub enum InferredType { pub struct InferenceContext<'a> { /// Typed tags - gamma: UnsafeCell>, + gamma: HashMap, /// Type annotations from user (for toplevel) annotations: &'a HashMap, /// Semantic action table @@ -109,13 +106,16 @@ impl<'a> InferenceContext<'a> { nforms: &'a HashMap>, ) -> Self { Self { - gamma: UnsafeCell::new(HashMap::new()), + gamma: HashMap::new(), annotations, semact, nforms, } } - fn infer_gather<'i, I: Iterator>(&self, mut tags: I) -> Option { + fn infer_gather<'i, I: Iterator>( + &mut self, + mut tags: I, + ) -> Option { if let Some(tag) = tags.next() { let mut types = vec![self.infer(tag)?]; for t in tags { @@ -146,89 +146,86 @@ impl<'a> InferenceContext<'a> { } } } - std::mem::take(self.gamma.get_mut()) + self.gamma } - fn infer(&self, tag: &Tag) -> Option { - match unsafe { (*self.gamma.get()).entry(tag.clone()) } { - // If a tag has been inferred, return its type directly - Entry::Occupied(entry) => Some(entry.get().clone()), - Entry::Vacant(slot) => Some( - slot.insert({ - // If a concrete type annotation is provided, use it directly - if let Some(x) = self.annotations.get(tag) { - InferredType::Concrete(x.clone()) - } else { - let semact = self.semact.get(tag); - match semact { - // No semantic action, the type is unit - None => InferredType::Concrete(parse_quote!(())), - // Token semantic action, the type is Span - Some(SemAct::Token) => { - InferredType::Concrete(parse_quote!(::pag_runtime::Span<'src>)) + fn infer(&mut self, tag: &Tag) -> Option { + if let Some(x) = self.gamma.get(tag) { + return Some(x.clone()); + } + let target = + // If a concrete type annotation is provided, use it directly + if let Some(x) = self.annotations.get(tag) { + InferredType::Concrete(x.clone()) + } else { + let semact = self.semact.get(tag); + match semact { + // No semantic action, the type is unit + None => InferredType::Concrete(parse_quote!(())), + // Token semantic action, the type is Span + Some(SemAct::Token) => { + InferredType::Concrete(parse_quote!(::pag_runtime::Span<'src>)) + } + // Customized routine without type annotation -- inference failed + Some(SemAct::CustomizedRoutine(..)) => return None, + // Nested routine for one or more, the type is unit. + Some(SemAct::OneOrMoreNested) => { + InferredType::Concrete(parse_quote!(())) + } + Some(SemAct::Gather) => { + let nfs = self.nforms.get(tag)?; + let mut inferred = None; + // find first subexpression that fulfills inference + for i in nfs.iter() { + let visible = i.visible_bindings(0); + if let Some(gather_type) = + self.infer_gather(visible.into_iter().map(|x| x.1)) + { + inferred.replace(gather_type); + break; } - // Customized routine without type annotation -- inference failed - Some(SemAct::CustomizedRoutine(..)) => return None, - // Nested routine for one or more, the type is unit. - Some(SemAct::OneOrMoreNested) => { - InferredType::Concrete(parse_quote!(())) + } + inferred? + } + Some(SemAct::ZeroOrMore) + | Some(SemAct::Option) + | Some(SemAct::OneOrMoreToplevel) => { + let nfs = self.nforms.get(tag)?; + let mapper = |ty: InferredType| { + if matches!(semact, Some(SemAct::Option)) { + InferredType::Option(Box::new(ty)) + } else { + InferredType::Collector(Box::new(ty)) } - Some(SemAct::Gather) => { - let nfs = self.nforms.get(tag)?; - let mut inferred = None; - // find first subexpression that fulfills inference - for i in nfs.iter() { - let visible = i.visible_bindings(0); - if let Some(gather_type) = - self.infer_gather(visible.into_iter().map(|x| x.1)) - { - inferred.replace(gather_type); - break; - } + }; + let mut inferred = None; + // find first subexpression that fulfills inference + for i in nfs.iter() { + // Skip epsilon production, this is safe since OneOrMoreToplevel will never be empty + if let NormalForm::Empty(x) = i { + if x.is_empty() { + continue; } - inferred? } - Some(SemAct::ZeroOrMore) - | Some(SemAct::Option) - | Some(SemAct::OneOrMoreToplevel) => { - let nfs = self.nforms.get(tag)?; - let mapper = |ty: InferredType| { - if matches!(semact, Some(SemAct::Option)) { - InferredType::Option(Box::new(ty.clone())) - } else { - InferredType::Collector(Box::new(ty.clone())) - } - }; - let mut inferred = None; - // find first subexpression that fulfills inference - for i in nfs.iter() { - // Skip epsilon production, this is safe since OneOrMoreToplevel will never be empty - if let NormalForm::Empty(x) = i { - if x.is_empty() { - continue; - } - } - // skip the trailing part of OneOrMoreToplevel - let visible = i.visible_bindings( - if matches!(semact, Some(SemAct::OneOrMoreToplevel)) { - 1 - } else { - 0 - }, - ); - if let Some(gather_type) = - self.infer_gather(visible.into_iter().map(|x| x.1)) - { - inferred.replace(mapper(gather_type)); - break; - } - } - inferred? + // skip the trailing part of OneOrMoreToplevel + let visible = i.visible_bindings( + if matches!(semact, Some(SemAct::OneOrMoreToplevel)) { + 1 + } else { + 0 + }, + ); + if let Some(gather_type) = + self.infer_gather(visible.into_iter().map(|x| x.1)) + { + inferred.replace(mapper(gather_type)); + break; } } + inferred? } - }) - .clone(), - ), - } + } + }; + self.gamma.insert(tag.clone(), target.clone()); + Some(target) } } diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index ea57f14..c26aa3f 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -11,7 +11,7 @@ mod semact; use std::{ collections::{HashMap, VecDeque}, - ops::{ControlFlow, Deref}, + ops::Deref, }; use quote::format_ident; From 9654bfc60ca8395ecd4f3f6a350636623af8dad0 Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Thu, 27 Jul 2023 02:52:49 +0800 Subject: [PATCH 23/42] add simd tail handling --- pag-lexer/src/lookahead.rs | 42 ++++++++++++++++++++------------------ pag-lexer/src/vector.rs | 8 ++++++-- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/pag-lexer/src/lookahead.rs b/pag-lexer/src/lookahead.rs index 52b236d..61e93d8 100644 --- a/pag-lexer/src/lookahead.rs +++ b/pag-lexer/src/lookahead.rs @@ -33,17 +33,13 @@ fn generate_lut_routine(index: usize) -> TokenStream { fn byte_simd(byte: u8) -> TokenStream { let byte = byte_char(byte); - quote! { - data.simd_eq(u8x16::splat(#byte)) - } + quote! { data.simd_eq(u8x16::splat(#byte)) } } fn range_simd(min: u8, max: u8) -> TokenStream { let min = byte_char(min); let max = byte_char(max); - quote! { - data.simd_ge(u8x16::splat(#min)) & data.simd_le(u8x16::splat(#max)) - } + quote! { data.simd_ge(u8x16::splat(#min)) & data.simd_le(u8x16::splat(#max)) } } fn generate_lookahead_routine(intervals: &Intervals, kind: Kind) -> TokenStream { @@ -60,25 +56,31 @@ fn generate_lookahead_routine(intervals: &Intervals, kind: Kind) -> TokenStream .reduce(|acc, x| quote! { #acc | #x }) .map(|x| { if cfg!(target_arch = "aarch64") { - quote! {{ - let mask : u128 = unsafe { core::mem::transmute(#x) }; - mask.#count_act() / 8 - }} + quote! { unsafe { core::mem::transmute::<_, u128>(#x).#count_act() / 8 } } } else { - quote! { - (#x).to_bitmask().#count_act() - } + quote! { (#x).to_bitmask().#count_act() } } }); + let tail_act = match kind { + Kind::Positive => quote! { + while matches!(input.get(idx), Some(#intervals)) { idx += 1; } + }, + Kind::Negative => quote! { + while !matches!(input.get(idx), Some(#intervals) | None) { idx += 1; } + }, + }; quote! { - for i in input[idx..].array_chunks::<16>() { - use core::simd::*; - let data = u8x16::from_slice(i); - let idx_offset = #idx_offset; - idx += idx_offset as usize; - if core::intrinsics::unlikely(idx_offset != 16) { - break; + 'lookahead: { + for i in input[idx..].array_chunks::<16>() { + use core::simd::*; + let data = u8x16::from_slice(i); + let idx_offset = #idx_offset; + idx += idx_offset as usize; + if core::intrinsics::unlikely(idx_offset != 16) { + break 'lookahead; + } } + #tail_act } } } diff --git a/pag-lexer/src/vector.rs b/pag-lexer/src/vector.rs index c513811..61488ae 100644 --- a/pag-lexer/src/vector.rs +++ b/pag-lexer/src/vector.rs @@ -151,16 +151,20 @@ impl Vector { }, }; } + let lookahead = optimizer.generate_lookahead(&dfa, state); let transitions = info.transitions.iter().map(|(interval, target)| { if leaf_states.contains(target) { let rule_idx = target.last_success.unwrap(); let on_success = &success_actions[rule_idx]; return quote! { Some(#interval) => { cursor = idx + 1; #on_success }, }; } - let target_label = format_ident!("S{}", dfa[target].state_id); + let target_id = dfa[target].state_id; + if lookahead.is_some() && info.state_id == target_id { + return quote! {}; + } + let target_label = format_ident!("S{}", target_id); quote! { Some(#interval) => state = State::#target_label, } }); - let lookahead = optimizer.generate_lookahead(&dfa, state); let otherwise = state .last_success .and_then(|x| success_actions.get(x)) From e0622ee22250d8ddee76704c90f4a5df9c2d188e Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Thu, 27 Jul 2023 03:12:09 +0800 Subject: [PATCH 24/42] adjust lookahead generation --- README.md | 2 -- benches/csv/src/lib.rs | 3 +-- benches/json/src/lib.rs | 3 +-- pag-lexer/src/lib.rs | 3 +-- pag-lexer/src/lookahead.rs | 44 +++++++++++++++---------------- pag-parser/src/fusion.rs | 2 +- tests/arith-expr/src/lib.rs | 3 +-- tests/sexpr-calculator/src/lib.rs | 3 +-- tests/tokenizer/src/lib.rs | 3 +-- 9 files changed, 29 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 44cf57e..1aa2db3 100644 --- a/README.md +++ b/README.md @@ -121,8 +121,6 @@ For some reasons (mostly performance issues), only nightly rust (1.71+) is suppo should be annotated with ```rust #![feature(portable_simd)] -#![feature(core_intrinsics)] -#![feature(array_chunks)] ``` diff --git a/benches/csv/src/lib.rs b/benches/csv/src/lib.rs index a598426..d0b5b15 100644 --- a/benches/csv/src/lib.rs +++ b/benches/csv/src/lib.rs @@ -1,6 +1,5 @@ #![feature(portable_simd)] -#![feature(core_intrinsics)] -#![feature(array_chunks)] + mod parser; pub use parser::parse; diff --git a/benches/json/src/lib.rs b/benches/json/src/lib.rs index 92b0dbe..aa04492 100644 --- a/benches/json/src/lib.rs +++ b/benches/json/src/lib.rs @@ -1,6 +1,5 @@ #![feature(portable_simd)] -#![feature(core_intrinsics)] -#![feature(array_chunks)] + mod parser; pub use parser::parse; diff --git a/pag-lexer/src/lib.rs b/pag-lexer/src/lib.rs index ab93457..45a0c1c 100644 --- a/pag-lexer/src/lib.rs +++ b/pag-lexer/src/lib.rs @@ -5,9 +5,8 @@ // license , at your // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. + #![feature(portable_simd)] -#![feature(core_intrinsics)] -#![feature(array_chunks)] pub mod congruence; pub mod derivative; diff --git a/pag-lexer/src/lookahead.rs b/pag-lexer/src/lookahead.rs index 61e93d8..8f6c825 100644 --- a/pag-lexer/src/lookahead.rs +++ b/pag-lexer/src/lookahead.rs @@ -61,26 +61,24 @@ fn generate_lookahead_routine(intervals: &Intervals, kind: Kind) -> TokenStream quote! { (#x).to_bitmask().#count_act() } } }); - let tail_act = match kind { - Kind::Positive => quote! { - while matches!(input.get(idx), Some(#intervals)) { idx += 1; } - }, - Kind::Negative => quote! { - while !matches!(input.get(idx), Some(#intervals) | None) { idx += 1; } - }, - }; + let tail_match = match kind { + Kind::Positive => quote! { matches!(input.get(idx), Some(#intervals)) }, + Kind::Negative => quote! { !matches!(input.get(idx), Some(#intervals) | None) }, + }; quote! { 'lookahead: { - for i in input[idx..].array_chunks::<16>() { + for chunk in input[idx..].chunks_exact(16) { use core::simd::*; - let data = u8x16::from_slice(i); + let data = u8x16::from_slice(chunk); let idx_offset = #idx_offset; idx += idx_offset as usize; - if core::intrinsics::unlikely(idx_offset != 16) { + if idx_offset != 16 { break 'lookahead; } } - #tail_act + while #tail_match { + idx += 1; + } } } } @@ -141,20 +139,22 @@ impl LoopOptimizer { } pub fn generate_lookahead(&mut self, dfa: &DfaTable, state: &DfaState) -> Option { - let limit = 4; + let limit = 8; let positives = direct_self_loops(dfa, state)?; - if estimated_cost(&positives) <= limit { - return Some(generate_lookahead_routine(&positives, Kind::Positive)); - } - let negatives = positives.complement()?; - if estimated_cost(&negatives) <= limit { - return Some(generate_lookahead_routine(&negatives, Kind::Negative)); - } + let pos_cost = estimated_cost(&positives); + let neg_cost = estimated_cost(&negatives); - let index = self.assign_table(&negatives); - Some(generate_lut_routine(index)) + if pos_cost.min(neg_cost) > limit { + let index = self.assign_table(&negatives); + return Some(generate_lut_routine(index)); + } + if pos_cost < neg_cost { + Some(generate_lookahead_routine(&positives, Kind::Positive)) + } else { + Some(generate_lookahead_routine(&negatives, Kind::Negative)) + } } } diff --git a/pag-parser/src/fusion.rs b/pag-parser/src/fusion.rs index a6a289b..12bef15 100644 --- a/pag-parser/src/fusion.rs +++ b/pag-parser/src/fusion.rs @@ -96,7 +96,7 @@ fn generate_error() -> TokenStream { let expect = match self.expecting { [head] => head.to_string(), [init @ .., last] => format!("{} or {last}", init.join(", ")), - _ => unsafe { std::intrinsics::unreachable() }, + _ => unsafe { std::hint::unreachable_unchecked() }, }; write!( f, diff --git a/tests/arith-expr/src/lib.rs b/tests/arith-expr/src/lib.rs index f6def5e..c59f0e4 100644 --- a/tests/arith-expr/src/lib.rs +++ b/tests/arith-expr/src/lib.rs @@ -1,6 +1,5 @@ #![feature(portable_simd)] -#![feature(core_intrinsics)] -#![feature(array_chunks)] + use std::num::Wrapping; mod parser; diff --git a/tests/sexpr-calculator/src/lib.rs b/tests/sexpr-calculator/src/lib.rs index e3a768c..ee7bfc0 100644 --- a/tests/sexpr-calculator/src/lib.rs +++ b/tests/sexpr-calculator/src/lib.rs @@ -1,6 +1,5 @@ #![feature(portable_simd)] -#![feature(core_intrinsics)] -#![feature(array_chunks)] + use std::num::Wrapping; mod parser; diff --git a/tests/tokenizer/src/lib.rs b/tests/tokenizer/src/lib.rs index 0b27ef4..650fedf 100644 --- a/tests/tokenizer/src/lib.rs +++ b/tests/tokenizer/src/lib.rs @@ -1,6 +1,5 @@ #![feature(portable_simd)] -#![feature(core_intrinsics)] -#![feature(array_chunks)] + mod comment_and_string; mod common_prefix; mod generated; From 2eaac6d7bb0819081251e1f657da5a5e441e6821 Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Thu, 27 Jul 2023 23:01:29 +0800 Subject: [PATCH 25/42] try to solve aarch64 performance regression --- benches/json/Cargo.toml | 2 +- pag-lexer/src/lookahead.rs | 84 +++++++++++++++++++++++++------------- pag-lexer/src/vector.rs | 1 + 3 files changed, 58 insertions(+), 29 deletions(-) diff --git a/benches/json/Cargo.toml b/benches/json/Cargo.toml index c72e9d6..84b3803 100644 --- a/benches/json/Cargo.toml +++ b/benches/json/Cargo.toml @@ -17,7 +17,7 @@ lalrpop = "0.20.0" [dev-dependencies] criterion = { version = "0.4", features = ["html_reports"] } snmalloc-rs = { version = "0.3", features = ["build_cc"] } -pest = { version = "2.5.7", features = [ "std", "memchr" ] } +pest = { version = "2.5.7", features = ["std", "memchr"] } pest_derive = "2.5.7" lalrpop-util = { version = "0.20.0", features = ["lexer", "unicode"] } logos = "0.13.0" diff --git a/pag-lexer/src/lookahead.rs b/pag-lexer/src/lookahead.rs index 8f6c825..bc120d3 100644 --- a/pag-lexer/src/lookahead.rs +++ b/pag-lexer/src/lookahead.rs @@ -31,36 +31,27 @@ fn generate_lut_routine(index: usize) -> TokenStream { } } -fn byte_simd(byte: u8) -> TokenStream { - let byte = byte_char(byte); - quote! { data.simd_eq(u8x16::splat(#byte)) } -} - -fn range_simd(min: u8, max: u8) -> TokenStream { - let min = byte_char(min); - let max = byte_char(max); - quote! { data.simd_ge(u8x16::splat(#min)) & data.simd_le(u8x16::splat(#max)) } -} - +#[cfg(not(target_arch = "aarch64"))] fn generate_lookahead_routine(intervals: &Intervals, kind: Kind) -> TokenStream { - let count_act = match kind { - Kind::Positive => quote! { trailing_ones }, - Kind::Negative => quote! { trailing_zeros }, - }; - let idx_offset = intervals + let mask = intervals .iter() .map(|&Interval(l, r)| match l == r { - true => byte_simd(l), - false => range_simd(l, r), + true => { + let l = byte_char(l); + quote! { data.simd_eq(u8x16::splat(#l)) } + } + false => { + let l = byte_char(l); + let r = byte_char(r); + quote! { data.simd_ge(u8x16::splat(#l)) & data.simd_le(u8x16::splat(#r)) } + } }) .reduce(|acc, x| quote! { #acc | #x }) - .map(|x| { - if cfg!(target_arch = "aarch64") { - quote! { unsafe { core::mem::transmute::<_, u128>(#x).#count_act() / 8 } } - } else { - quote! { (#x).to_bitmask().#count_act() } - } - }); + .unwrap(); + let count_act = match kind { + Kind::Positive => quote! { trailing_ones }, + Kind::Negative => quote! { trailing_zeros }, + }; let tail_match = match kind { Kind::Positive => quote! { matches!(input.get(idx), Some(#intervals)) }, Kind::Negative => quote! { !matches!(input.get(idx), Some(#intervals) | None) }, @@ -70,7 +61,8 @@ fn generate_lookahead_routine(intervals: &Intervals, kind: Kind) -> TokenStream for chunk in input[idx..].chunks_exact(16) { use core::simd::*; let data = u8x16::from_slice(chunk); - let idx_offset = #idx_offset; + let mask = #mask; + let idx_offset = mask.to_bitmask().#count_act(); idx += idx_offset as usize; if idx_offset != 16 { break 'lookahead; @@ -83,10 +75,46 @@ fn generate_lookahead_routine(intervals: &Intervals, kind: Kind) -> TokenStream } } +#[cfg(target_arch = "aarch64")] +fn generate_lookahead_routine(intervals: &Intervals, kind: Kind) -> TokenStream { + let mask = intervals + .iter() + .map(|&Interval(l, r)| match l == r { + true => { + let l = byte_char(l); + quote! { data.simd_eq(u8x16::splat(#l)) } + } + false => { + let l = byte_char(l); + let r = byte_char(r); + quote! { data.simd_ge(u8x16::splat(#l)) & data.simd_le(u8x16::splat(#r)) } + } + }) + .reduce(|acc, x| quote! { #acc | #x }) + .unwrap(); + let count_act = match kind { + Kind::Positive => quote! { trailing_ones }, + Kind::Negative => quote! { trailing_zeros }, + }; + quote! { + for chunk in input[idx..].chunks_exact(16) { + use core::simd::*; + let data = u8x16::from_slice(chunk); + let mask = #mask; + let mask = unsafe { core::mem::transmute::<_, u128>(mask) }; + let idx_offset = mask.#count_act() / 8; + idx += idx_offset as usize; + if idx_offset != 16 { + break; + } + } + } +} + fn estimated_cost(intervals: &Intervals) -> u32 { intervals .iter() - .map(|Interval(l, r)| if l == r { 1 } else { 2 }) + .map(|Interval(l, r)| 1 + (l != r) as u32) .sum() } @@ -139,7 +167,7 @@ impl LoopOptimizer { } pub fn generate_lookahead(&mut self, dfa: &DfaTable, state: &DfaState) -> Option { - let limit = 8; + let limit = 4; let positives = direct_self_loops(dfa, state)?; let negatives = positives.complement()?; diff --git a/pag-lexer/src/vector.rs b/pag-lexer/src/vector.rs index 61488ae..e23ee50 100644 --- a/pag-lexer/src/vector.rs +++ b/pag-lexer/src/vector.rs @@ -159,6 +159,7 @@ impl Vector { return quote! { Some(#interval) => { cursor = idx + 1; #on_success }, }; } let target_id = dfa[target].state_id; + #[cfg(not(target_arch = "aarch64"))] if lookahead.is_some() && info.state_id == target_id { return quote! {}; } From 91b97100f53a419c92ec8ae9519f2870b476b3b7 Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Sun, 30 Jul 2023 04:03:10 +0800 Subject: [PATCH 26/42] optimize lut lookahead --- pag-lexer/src/lookahead.rs | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/pag-lexer/src/lookahead.rs b/pag-lexer/src/lookahead.rs index bc120d3..b72d17c 100644 --- a/pag-lexer/src/lookahead.rs +++ b/pag-lexer/src/lookahead.rs @@ -19,15 +19,36 @@ enum Kind { } fn generate_lut_routine(index: usize) -> TokenStream { + // TODO: put the code to `pag_util::lookahead_lut` to reduce stack size under debug build let table = index / 8; let shift = index % 8; let bit = 1u8 << shift; quote! { - idx = idx - + input[idx..] + 'lookahead: { + for chunk in input[idx..].chunks_exact(8) { + if GLOBAL_LUT[#table][chunk[0] as usize] & #bit == 0 { + if GLOBAL_LUT[#table][chunk[1] as usize] & #bit == 0 { + if GLOBAL_LUT[#table][chunk[2] as usize] & #bit == 0 { + if GLOBAL_LUT[#table][chunk[3] as usize] & #bit == 0 { + if GLOBAL_LUT[#table][chunk[4] as usize] & #bit == 0 { + if GLOBAL_LUT[#table][chunk[5] as usize] & #bit == 0 { + if GLOBAL_LUT[#table][chunk[6] as usize] & #bit == 0 { + if GLOBAL_LUT[#table][chunk[7] as usize] & #bit == 0 { + idx += 8; continue; } + idx += 7; break 'lookahead; } + idx += 6; break 'lookahead; } + idx += 5; break 'lookahead; } + idx += 4; break 'lookahead; } + idx += 3; break 'lookahead; } + idx += 2; break 'lookahead; } + idx += 1; break 'lookahead; } + break 'lookahead; + } + idx += input[idx..] .iter() .position(|x| GLOBAL_LUT[#table][*x as usize] & #bit > 0) - .unwrap_or(input.len() - idx); + .unwrap_or(input[idx..].len()); + } } } From fada7c6f6061519fd2690ef6a7fb519c879fe6b5 Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Sun, 30 Jul 2023 07:46:11 +0800 Subject: [PATCH 27/42] fix stack size issue --- benches/csv/Cargo.toml | 1 + benches/json/Cargo.toml | 1 + pag-lexer/src/lookahead.rs | 30 +---------------------- pag-util/Cargo.toml | 23 ++++++++++++++++++ pag-util/src/lib.rs | 40 +++++++++++++++++++++++++++++++ tests/arith-expr/Cargo.toml | 1 + tests/sexpr-calculator/Cargo.toml | 1 + tests/tokenizer/Cargo.toml | 1 + 8 files changed, 69 insertions(+), 29 deletions(-) create mode 100644 pag-util/Cargo.toml create mode 100644 pag-util/src/lib.rs diff --git a/benches/csv/Cargo.toml b/benches/csv/Cargo.toml index e00dc46..8333262 100644 --- a/benches/csv/Cargo.toml +++ b/benches/csv/Cargo.toml @@ -7,6 +7,7 @@ publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +pag-util = { version = "0.1.0-alpha.1", path = "../../pag-util" } rand = { version = "0.8" } snmalloc-rs = { version = "0.3", features = ["build_cc"] } diff --git a/benches/json/Cargo.toml b/benches/json/Cargo.toml index 84b3803..dc705b3 100644 --- a/benches/json/Cargo.toml +++ b/benches/json/Cargo.toml @@ -7,6 +7,7 @@ publish = false autobenches = false [dependencies] +pag-util = { version = "0.1.0-alpha.1", path = "../../pag-util" } rand = { version = "0.8" } serde_json = "1.0" diff --git a/pag-lexer/src/lookahead.rs b/pag-lexer/src/lookahead.rs index b72d17c..0950294 100644 --- a/pag-lexer/src/lookahead.rs +++ b/pag-lexer/src/lookahead.rs @@ -19,37 +19,9 @@ enum Kind { } fn generate_lut_routine(index: usize) -> TokenStream { - // TODO: put the code to `pag_util::lookahead_lut` to reduce stack size under debug build let table = index / 8; let shift = index % 8; - let bit = 1u8 << shift; - quote! { - 'lookahead: { - for chunk in input[idx..].chunks_exact(8) { - if GLOBAL_LUT[#table][chunk[0] as usize] & #bit == 0 { - if GLOBAL_LUT[#table][chunk[1] as usize] & #bit == 0 { - if GLOBAL_LUT[#table][chunk[2] as usize] & #bit == 0 { - if GLOBAL_LUT[#table][chunk[3] as usize] & #bit == 0 { - if GLOBAL_LUT[#table][chunk[4] as usize] & #bit == 0 { - if GLOBAL_LUT[#table][chunk[5] as usize] & #bit == 0 { - if GLOBAL_LUT[#table][chunk[6] as usize] & #bit == 0 { - if GLOBAL_LUT[#table][chunk[7] as usize] & #bit == 0 { - idx += 8; continue; } - idx += 7; break 'lookahead; } - idx += 6; break 'lookahead; } - idx += 5; break 'lookahead; } - idx += 4; break 'lookahead; } - idx += 3; break 'lookahead; } - idx += 2; break 'lookahead; } - idx += 1; break 'lookahead; } - break 'lookahead; - } - idx += input[idx..] - .iter() - .position(|x| GLOBAL_LUT[#table][*x as usize] & #bit > 0) - .unwrap_or(input[idx..].len()); - } - } + quote! { idx = ::pag_util::lookahead_lut(input, idx, &GLOBAL_LUT[#table], #shift); } } #[cfg(not(target_arch = "aarch64"))] diff --git a/pag-util/Cargo.toml b/pag-util/Cargo.toml new file mode 100644 index 0000000..0a87fdf --- /dev/null +++ b/pag-util/Cargo.toml @@ -0,0 +1,23 @@ +# Copyright (c) 2023 Paguroidea Developers +# +# Licensed under the Apache License, Version 2.0 +# or the MIT +# license , at your +# option. All files in the project carrying such notice may not be copied, +# modified, or distributed except according to those terms. + +[package] +name = "pag-util" +keywords = ["parser", "cfg", "grammar"] +description = "Parser-lexer fusion generator (utilities)" +documentation = "https://docs.rs/pag-util/" + +version.workspace = true +edition.workspace = true +license.workspace = true +exclude.workspace = true +categories.workspace = true +repository.workspace = true +rust-version.workspace = true +authors.workspace = true +readme.workspace = true diff --git a/pag-util/src/lib.rs b/pag-util/src/lib.rs new file mode 100644 index 0000000..3cf6031 --- /dev/null +++ b/pag-util/src/lib.rs @@ -0,0 +1,40 @@ +use std::hint::unreachable_unchecked; + +#[doc(hidden)] +#[inline] +pub unsafe fn assume(cond: bool) { + if !cond { + unreachable_unchecked() + } +} + +#[doc(hidden)] +#[inline] +#[rustfmt::skip] +pub fn lookahead_lut(input: &[u8], mut idx: usize, table: &[u8; 256], shift: usize) -> usize { + let mask = 1 << shift; + for chunk in input[idx..].chunks_exact(8) { + if table[chunk[0] as usize] & mask == 0 { + if table[chunk[1] as usize] & mask == 0 { + if table[chunk[2] as usize] & mask == 0 { + if table[chunk[3] as usize] & mask == 0 { + if table[chunk[4] as usize] & mask == 0 { + if table[chunk[5] as usize] & mask == 0 { + if table[chunk[6] as usize] & mask == 0 { + if table[chunk[7] as usize] & mask == 0 { + idx += 8; continue; } + idx += 7; return idx; } + idx += 6; return idx; } + idx += 5; return idx; } + idx += 4; return idx; } + idx += 3; return idx; } + idx += 2; return idx; } + idx += 1; return idx; } + return idx; + } + unsafe { assume(idx <= input.len()) }; + idx + input[idx..] + .iter() + .position(|x| table[*x as usize] & mask > 0) + .unwrap_or(input[idx..].len()) +} diff --git a/tests/arith-expr/Cargo.toml b/tests/arith-expr/Cargo.toml index c761cf0..d495c11 100644 --- a/tests/arith-expr/Cargo.toml +++ b/tests/arith-expr/Cargo.toml @@ -6,6 +6,7 @@ build = "build.rs" publish = false [dependencies] +pag-util = { version = "0.1.0-alpha.1", path = "../../pag-util" } rand = { version = "0.8" } [build-dependencies] diff --git a/tests/sexpr-calculator/Cargo.toml b/tests/sexpr-calculator/Cargo.toml index 2440e44..cbe7938 100644 --- a/tests/sexpr-calculator/Cargo.toml +++ b/tests/sexpr-calculator/Cargo.toml @@ -6,6 +6,7 @@ build = "build.rs" publish = false [dependencies] +pag-util = { version = "0.1.0-alpha.1", path = "../../pag-util" } rand = { version = "0.8" } [build-dependencies] diff --git a/tests/tokenizer/Cargo.toml b/tests/tokenizer/Cargo.toml index 4de8101..d8cc23f 100644 --- a/tests/tokenizer/Cargo.toml +++ b/tests/tokenizer/Cargo.toml @@ -6,6 +6,7 @@ build = "build.rs" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +pag-util = { version = "0.1.0-alpha.1", path = "../../pag-util" } rand = { version = "0.8" } [build-dependencies] From 997a0fdfd7927fa7986d146e7436e389b0dc6a29 Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Sun, 30 Jul 2023 08:16:45 +0800 Subject: [PATCH 28/42] add assume all over the generated code --- pag-lexer/src/lookahead.rs | 2 ++ pag-lexer/src/vector.rs | 1 + pag-util/src/lib.rs | 1 + 3 files changed, 4 insertions(+) diff --git a/pag-lexer/src/lookahead.rs b/pag-lexer/src/lookahead.rs index 0950294..c1cad8c 100644 --- a/pag-lexer/src/lookahead.rs +++ b/pag-lexer/src/lookahead.rs @@ -51,6 +51,7 @@ fn generate_lookahead_routine(intervals: &Intervals, kind: Kind) -> TokenStream }; quote! { 'lookahead: { + unsafe { ::pag_util::assume(idx <= input.len()) }; for chunk in input[idx..].chunks_exact(16) { use core::simd::*; let data = u8x16::from_slice(chunk); @@ -90,6 +91,7 @@ fn generate_lookahead_routine(intervals: &Intervals, kind: Kind) -> TokenStream Kind::Negative => quote! { trailing_zeros }, }; quote! { + unsafe { ::pag_util::assume(idx <= input.len()) }; for chunk in input[idx..].chunks_exact(16) { use core::simd::*; let data = u8x16::from_slice(chunk); diff --git a/pag-lexer/src/vector.rs b/pag-lexer/src/vector.rs index e23ee50..11421bf 100644 --- a/pag-lexer/src/vector.rs +++ b/pag-lexer/src/vector.rs @@ -142,6 +142,7 @@ impl Vector { let on_success = &success_actions[rule_idx]; return quote! { State::#label => { + unsafe { ::pag_util::assume(idx <= input.len()) }; if input[idx..].starts_with(#literal) { cursor = idx + #length; #on_success diff --git a/pag-util/src/lib.rs b/pag-util/src/lib.rs index 3cf6031..6e2520f 100644 --- a/pag-util/src/lib.rs +++ b/pag-util/src/lib.rs @@ -13,6 +13,7 @@ pub unsafe fn assume(cond: bool) { #[rustfmt::skip] pub fn lookahead_lut(input: &[u8], mut idx: usize, table: &[u8; 256], shift: usize) -> usize { let mask = 1 << shift; + unsafe { assume(idx <= input.len()) }; for chunk in input[idx..].chunks_exact(8) { if table[chunk[0] as usize] & mask == 0 { if table[chunk[1] as usize] & mask == 0 { From b701d640feefaffacfd41e7ec6cefda73ec7edee Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Thu, 27 Jul 2023 16:22:44 -0400 Subject: [PATCH 29/42] refactor semact structure --- pag-parser2/src/frontend/ast.rs | 58 ++++++++++++- pag-parser2/src/frontend/parse.rs | 4 +- pag-parser2/src/nf/inference.rs | 127 +++++++++++++--------------- pag-parser2/src/nf/mod.rs | 71 ++++++++++++---- pag-parser2/src/nf/normalization.rs | 1 + pag-parser2/src/nf/semact.rs | 36 +++++++- pag-parser2/src/nf/translation.rs | 64 ++++++++++++++ 7 files changed, 270 insertions(+), 91 deletions(-) create mode 100644 pag-parser2/src/nf/normalization.rs create mode 100644 pag-parser2/src/nf/translation.rs diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index 8de4969..4dce9d9 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -7,6 +7,7 @@ // modified, or distributed except according to those terms. use std::collections::HashMap; +use std::rc::Rc; pub struct Ast { pub entry: syn::Ident, @@ -15,6 +16,36 @@ pub struct Ast { pub parser_map: HashMap, } +#[derive(Clone)] +#[repr(transparent)] +pub struct CustomizedBlock(pub Rc); + +impl PartialEq for CustomizedBlock { + fn eq(&self, other: &Self) -> bool { + Rc::ptr_eq(&self.0, &other.0) + } +} + +impl Eq for CustomizedBlock {} + +impl PartialOrd for CustomizedBlock { + fn partial_cmp(&self, other: &Self) -> Option { + Rc::as_ptr(&self.0).partial_cmp(&Rc::as_ptr(&other.0)) + } +} + +impl Ord for CustomizedBlock { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + Rc::as_ptr(&self.0).cmp(&Rc::as_ptr(&other.0)) + } +} + +impl std::hash::Hash for CustomizedBlock { + fn hash(&self, state: &mut H) { + Rc::as_ptr(&self.0).hash(state) + } +} + pub struct LexerDef { pub idx: u32, pub expr: LexerExpr, @@ -27,7 +58,7 @@ pub struct ParserDef { pub struct ParserRule { pub vars: Vec, - pub action: Option, + pub action: Option, } pub struct VarBinding { @@ -60,3 +91,28 @@ pub enum ParserExpr { ParserRef(syn::Ident), Ignore(Box), } + +pub struct RightDeepIterator<'a> { + seq: Option<&'a LexerExpr>, +} + +impl<'a> From<&'a LexerExpr> for RightDeepIterator<'a> { + fn from(expr: &'a LexerExpr) -> Self { + Self { seq: Some(expr) } + } +} + +impl<'a> Iterator for RightDeepIterator<'a> { + type Item = &'a LexerExpr; + + fn next(&mut self) -> Option { + match self.seq { + Some(LexerExpr::Seq(a, b)) => { + self.seq = Some(b); + Some(a) + } + Some(_) => self.seq.take(), + None => None, + } + } +} diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index 093842c..9b782ad 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -128,7 +128,9 @@ impl Parse for ParserRule { let mut action = None; if input.peek(syn::token::Brace) { - action = Some(input.parse::()?); + action = Some(CustomizedBlock(std::rc::Rc::new( + input.parse::()?, + ))); } Ok(Self { vars, action }) diff --git a/pag-parser2/src/nf/inference.rs b/pag-parser2/src/nf/inference.rs index 7838780..a474b3d 100644 --- a/pag-parser2/src/nf/inference.rs +++ b/pag-parser2/src/nf/inference.rs @@ -71,14 +71,11 @@ // ------------------- // Γ ⊢ x : η -use std::collections::{HashMap}; +use std::collections::HashMap; use syn::{parse_quote, Type}; -use super::{ - semact::{SemAct, SemActTable}, - NormalForm, Tag, -}; +use super::{semact::SemAct, BoundTarget, NormalForm, Tag}; #[derive(Clone)] pub enum InferredType { @@ -93,8 +90,6 @@ pub struct InferenceContext<'a> { gamma: HashMap, /// Type annotations from user (for toplevel) annotations: &'a HashMap, - /// Semantic action table - semact: &'a SemActTable, /// Fully normalized terms nforms: &'a HashMap>, } @@ -102,25 +97,31 @@ impl<'a> InferenceContext<'a> { /// Create a new inference context pub fn new( annotations: &'a HashMap, - semact: &'a SemActTable, nforms: &'a HashMap>, ) -> Self { Self { gamma: HashMap::new(), annotations, - semact, nforms, } } - fn infer_gather<'i, I: Iterator>( + fn infer_gather<'i, I: Iterator>>( &mut self, mut tags: I, ) -> Option { if let Some(tag) = tags.next() { - let mut types = vec![self.infer(tag)?]; + let mut types = vec![if let BoundTarget::Tag(tag) = tag { + self.infer(tag)? + } else { + InferredType::Concrete(parse_quote! {::pag_runtime::Span<'src>}) + }]; for t in tags { // If any inference fails, the whole inference fails - let ty = self.infer(t)?; + let ty = if let BoundTarget::Tag(t) = t { + self.infer(t)? + } else { + InferredType::Concrete(parse_quote! {::pag_runtime::Span<'src>}) + }; types.push(ty); } if types.len() == 1 { @@ -152,78 +153,70 @@ impl<'a> InferenceContext<'a> { if let Some(x) = self.gamma.get(tag) { return Some(x.clone()); } - let target = - // If a concrete type annotation is provided, use it directly - if let Some(x) = self.annotations.get(tag) { - InferredType::Concrete(x.clone()) - } else { - let semact = self.semact.get(tag); + let target = if let Some(x) = self.annotations.get(tag) { + // If a concrete type annotation is provided, use it directly + InferredType::Concrete(x.clone()) + } else { + // find first subexpression that fulfills inference + let nfs = self.nforms.get(tag)?; + let mut inferred = None; + for i in nfs.iter() { + let semact = i.semact(); match semact { - // No semantic action, the type is unit - None => InferredType::Concrete(parse_quote!(())), // Token semantic action, the type is Span - Some(SemAct::Token) => { - InferredType::Concrete(parse_quote!(::pag_runtime::Span<'src>)) + SemAct::Token => { + inferred.replace(InferredType::Concrete(parse_quote!( + ::pag_runtime::Span<'src> + ))); + break; } - // Customized routine without type annotation -- inference failed - Some(SemAct::CustomizedRoutine(..)) => return None, + // Customized routine without type annotation, cannot infer + SemAct::CustomizedRoutine(..) => continue, // Nested routine for one or more, the type is unit. - Some(SemAct::OneOrMoreNested) => { - InferredType::Concrete(parse_quote!(())) + SemAct::OneOrMoreNested => { + inferred.replace(InferredType::Concrete(parse_quote!(()))); + break; } - Some(SemAct::Gather) => { - let nfs = self.nforms.get(tag)?; - let mut inferred = None; - // find first subexpression that fulfills inference - for i in nfs.iter() { - let visible = i.visible_bindings(0); - if let Some(gather_type) = - self.infer_gather(visible.into_iter().map(|x| x.1)) - { - inferred.replace(gather_type); - break; - } + SemAct::Gather => { + let visible = i.visible_bindings(0); + if let Some(gather_type) = + self.infer_gather(visible.into_iter().map(|x| x.1)) + { + inferred.replace(gather_type); + break; } - inferred? } - Some(SemAct::ZeroOrMore) - | Some(SemAct::Option) - | Some(SemAct::OneOrMoreToplevel) => { - let nfs = self.nforms.get(tag)?; + SemAct::ZeroOrMore | SemAct::Option | SemAct::OneOrMoreToplevel => { let mapper = |ty: InferredType| { - if matches!(semact, Some(SemAct::Option)) { + if matches!(semact, SemAct::Option) { InferredType::Option(Box::new(ty)) } else { InferredType::Collector(Box::new(ty)) } }; - let mut inferred = None; - // find first subexpression that fulfills inference - for i in nfs.iter() { - // Skip epsilon production, this is safe since OneOrMoreToplevel will never be empty - if let NormalForm::Empty(x) = i { - if x.is_empty() { - continue; - } - } - // skip the trailing part of OneOrMoreToplevel - let visible = i.visible_bindings( - if matches!(semact, Some(SemAct::OneOrMoreToplevel)) { - 1 - } else { - 0 - }, - ); - if let Some(gather_type) = - self.infer_gather(visible.into_iter().map(|x| x.1)) - { - inferred.replace(mapper(gather_type)); - break; + // Skip epsilon production, this is safe since OneOrMoreToplevel will never be empty + if let NormalForm::Empty(x, _) = i { + if x.is_empty() { + continue; } } - inferred? + // skip the trailing part of OneOrMoreToplevel + let visible = + i.visible_bindings(if matches!(semact, SemAct::OneOrMoreToplevel) { + 1 + } else { + 0 + }); + if let Some(gather_type) = + self.infer_gather(visible.into_iter().map(|x| x.1)) + { + inferred.replace(mapper(gather_type)); + break; + } } } + } + inferred? }; self.gamma.insert(tag.clone(), target.clone()); Some(target) diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index c26aa3f..764d18c 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -7,7 +7,9 @@ // modified, or distributed except according to those terms. mod inference; +mod normalization; mod semact; +mod translation; use std::{ collections::{HashMap, VecDeque}, @@ -20,6 +22,9 @@ use syn::Ident; #[cfg(feature = "debug")] use crate::debug::{styled, styled_write}; + +use self::semact::SemAct; + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Tag { Toplevel(Ident), @@ -94,38 +99,57 @@ impl std::fmt::Display for Action { } } -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum NormalForm { - Empty(Vec<(Tag, Option)>), - Unexpanded(Vec), - Sequence(Ident, Vec), + Empty(Vec<(Tag, Option)>, SemAct), + Unexpanded(Vec, SemAct), + Sequence(Ident, Option, Vec, SemAct), +} + +pub enum BoundTarget<'a> { + Tag(&'a Tag), + Token, } impl NormalForm { - pub fn visible_bindings(&self, skip: usize) -> Vec<(&Ident, &Tag)> { + pub fn semact(&self) -> &SemAct { + match self { + Self::Empty(_, semact) + | Self::Unexpanded(_, semact) + | Self::Sequence(_, _, _, semact) => semact, + } + } + pub fn visible_bindings(&self, skip: usize) -> Vec<(&Ident, BoundTarget)> { match self { - Self::Empty(actions) => actions + Self::Empty(actions, _) => actions .last() - .and_then(|(tag, ident)| Some((ident.as_ref()?, tag))) + .and_then(|(tag, ident)| Some((ident.as_ref()?, BoundTarget::Tag(tag)))) .into_iter() .collect(), - Self::Unexpanded(actions) | Self::Sequence(_, actions) => { + Self::Unexpanded(actions, _) | Self::Sequence(_, _, actions, _) => { let mut acc = VecDeque::new(); for act in actions.iter().rev().skip(skip) { match act { Action::Shift { tag, output } => { if let Some(ident) = output { - acc.push_front((ident, tag)); + acc.push_front((ident, BoundTarget::Tag(tag))); } } Action::Reduce { tag, output } => { if let Some(ident) = output { - acc.push_front((ident, tag)); + acc.push_front((ident, BoundTarget::Tag(tag))); } break; } } } + if let Self::Sequence(_, Some(tk), _, _) = self { + if acc.len() == actions.len() - skip + && !matches!(actions.first(), Some(Action::Reduce { .. })) + { + acc.push_front((tk, BoundTarget::Token)); + } + } acc.into_iter().collect() } } @@ -136,7 +160,7 @@ impl NormalForm { impl std::fmt::Display for NormalForm { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::Empty(actions) => { + Self::Empty(actions, _) => { write!(f, "ε")?; for (tag, output) in actions.iter() { if let Some(name) = output { @@ -146,14 +170,18 @@ impl std::fmt::Display for NormalForm { } } } - Self::Unexpanded(actions) => { + Self::Unexpanded(actions, _) => { write!(f, "{}", actions[0])?; for action in &actions[1..] { write!(f, "\t{}", action)?; } } - Self::Sequence(terminal, actions) => { - styled_write!(f, Color::Yellow, "{terminal}")?; + Self::Sequence(terminal, var, actions, _) => { + if let Some(tk) = var { + styled_write!(f, Color::Yellow, "{terminal}[{tk}]")?; + } else { + styled_write!(f, Color::Yellow, "{terminal}")?; + } for action in actions.iter() { write!(f, "\t{}", action)?; } @@ -169,6 +197,7 @@ fn debug_print_test() { use quote::format_ident; let sequence = NormalForm::Sequence( format_ident!("TEST"), + Some(format_ident!("x")), vec![ Action::Shift { tag: Tag::Toplevel(format_ident!("a")), @@ -187,6 +216,7 @@ fn debug_print_test() { output: None, }, ], + SemAct::Gather, ); println!("{}", sequence); } @@ -237,6 +267,7 @@ fn debug_print_nf_table() { use quote::format_ident; let sequence = NormalForm::Sequence( format_ident!("TEST"), + Some(format_ident!("x")), vec![ Action::Shift { tag: Tag::Toplevel(format_ident!("a")), @@ -255,11 +286,15 @@ fn debug_print_nf_table() { output: None, }, ], + SemAct::Gather, + ); + let empty = NormalForm::Empty( + vec![ + (Tag::Toplevel(format_ident!("a")), None), + (Tag::Toplevel(format_ident!("b")), Some(format_ident!("x"))), + ], + SemAct::Gather, ); - let empty = NormalForm::Empty(vec![ - (Tag::Toplevel(format_ident!("a")), None), - (Tag::Toplevel(format_ident!("b")), Some(format_ident!("x"))), - ]); let table = NFTable( vec![ ( diff --git a/pag-parser2/src/nf/normalization.rs b/pag-parser2/src/nf/normalization.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/pag-parser2/src/nf/normalization.rs @@ -0,0 +1 @@ + diff --git a/pag-parser2/src/nf/semact.rs b/pag-parser2/src/nf/semact.rs index 0dfd64b..c4b1e3e 100644 --- a/pag-parser2/src/nf/semact.rs +++ b/pag-parser2/src/nf/semact.rs @@ -6,11 +6,11 @@ // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. -use std::collections::HashMap; -use super::Tag; -pub type SemActTable = HashMap; + + +use crate::frontend::{CustomizedBlock, ParserExpr}; /// /// ``` @@ -21,8 +21,9 @@ pub type SemActTable = HashMap; /// ``` // those normal form without SemAct will be treated as plain scanner. +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum SemAct { - CustomizedRoutine(syn::Block), + CustomizedRoutine(CustomizedBlock), /// Gather inner data. If multiple is selected, return a tuple. /// If only one is selected, return target data. Gather, @@ -41,3 +42,30 @@ pub enum SemAct { /// Yield a token span, Token, } + +impl SemAct { + pub fn infer(expr: &ParserExpr) -> Self { + match expr { + ParserExpr::LexerRef(_) => SemAct::Token, + ParserExpr::Plus(_) => SemAct::OneOrMoreToplevel, + ParserExpr::Opt(_) => SemAct::Option, + ParserExpr::Star(_) => SemAct::ZeroOrMore, + _ => SemAct::Gather, + } + } +} + +#[cfg(feature = "debug")] +impl std::fmt::Display for SemAct { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SemAct::CustomizedRoutine(x) => write!(f, "{:?}", std::rc::Rc::as_ptr(&x.0)), + SemAct::Gather => write!(f, "Gather"), + SemAct::Option => write!(f, "Option"), + SemAct::ZeroOrMore => write!(f, "ZeroOrMore"), + SemAct::OneOrMoreToplevel => write!(f, "OneOrMoreToplevel"), + SemAct::OneOrMoreNested => write!(f, "OneOrMoreNested"), + SemAct::Token => write!(f, "Token"), + } + } +} diff --git a/pag-parser2/src/nf/translation.rs b/pag-parser2/src/nf/translation.rs new file mode 100644 index 0000000..5ef09ce --- /dev/null +++ b/pag-parser2/src/nf/translation.rs @@ -0,0 +1,64 @@ +//! +//! Transform from surface syntax to semi-normalized form +//! + +use std::collections::HashMap; + +use quote::format_ident; +use syn::{Ident, Type}; + +use crate::frontend::{ParserDef}; + +use super::{semact::SemAct, NormalForm, Tag}; + +struct Translation { + /// Table of semi-normalized production rules + semi_nfs: HashMap>, + /// Toplevel type annotations + annotations: HashMap, + /// Type hints when calling inner routines (collector) + hints: HashMap, + /// Counter of assigned non-explicit variable names + output_cnt: usize, + /// Counter of assigned anonymous routines + anonymous_cnt: usize, +} + +impl Translation { + // Allocate a new symbol for unamed variable bindings. + fn new_output_sym(&mut self) -> Ident { + let result = format_ident!("_{}", self.output_cnt); + self.output_cnt += 1; + result + } + // Allocate a new tag for anonymous routines. + fn new_anonymous_tag(&mut self) -> Tag { + let result = Tag::Anonymous(self.anonymous_cnt); + self.anonymous_cnt += 1; + result + } + // Translate a top-level definitioin + fn add_toplevel_def(&mut self, name: Ident, def: &ParserDef) { + let tag = Tag::Toplevel(name); + self.annotations.insert(tag.clone(), def.ty.clone()); + let rules = def + .rules + .iter() + .map(|rule| { + let semact = if let Some(x) = rule.action.clone() { + SemAct::CustomizedRoutine(x) + } else if rule.vars.len() == 1 { + SemAct::infer(&rule.vars[0].expr) + } else { + SemAct::Gather + }; + match semact { + SemAct::Gather | SemAct::CustomizedRoutine(_) => {} + _ => {} + } + todo!() + }) + .collect(); + self.semi_nfs.insert(tag, rules); + } +} From 0e1dab09eba445a980b9ada49259ad839a96987a Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Thu, 27 Jul 2023 22:20:51 -0400 Subject: [PATCH 30/42] allow inner collector to be hinted --- pag-parser2/src/frontend/ast.rs | 2 +- pag-parser2/src/frontend/parse.rs | 20 +++++++++++++++++--- pag-parser2/src/nf/mod.rs | 1 - pag-parser2/src/nf/semact.rs | 4 ---- pag-parser2/src/nf/translation.rs | 4 ++-- 5 files changed, 20 insertions(+), 11 deletions(-) diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index 4dce9d9..9912d84 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -64,7 +64,6 @@ pub struct ParserRule { pub struct VarBinding { pub expr: ParserExpr, pub name: Option, - pub ty: Option, } // TODO: how to express "bottom" & "any"? @@ -90,6 +89,7 @@ pub enum ParserExpr { LexerRef(syn::Ident), ParserRef(syn::Ident), Ignore(Box), + Hinted(Box, syn::Type), } pub struct RightDeepIterator<'a> { diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index 9b782ad..e70a51b 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -140,7 +140,7 @@ impl Parse for ParserRule { impl Parse for VarBinding { // ParserExpr ("[" syn::Ident (":" syn::Type)? "]")? fn parse(input: ParseStream) -> Result { - let expr = input.parse::()?; + let mut expr = input.parse::()?; let mut name = None; let mut ty = None; @@ -159,8 +159,10 @@ impl Parse for VarBinding { return Err(content.error("expected `]`")); } } - - Ok(Self { expr, name, ty }) + if let Some(ty) = ty { + expr = ParserExpr::Hinted(Box::new(expr), ty); + } + Ok(Self { expr, name }) } } @@ -313,6 +315,15 @@ fn parse_parser_expr(input: ParseStream, min_bp: u32) -> Result { lhs = ParserExpr::Seq(Box::new(lhs), Box::new(rhs)); continue; } + fn peek_and_parse_type(input: ParseStream, expr: ParserExpr) -> Result { + Ok(if input.peek(Token!(:)) { + input.parse::()?; + let ty = input.parse::()?; + ParserExpr::Hinted(Box::new(expr), ty) + } else { + expr + }) + } if input.peek(Token![*]) { let l_bp = 70; if l_bp < min_bp { @@ -320,6 +331,7 @@ fn parse_parser_expr(input: ParseStream, min_bp: u32) -> Result { } input.parse::()?; lhs = ParserExpr::Star(Box::new(lhs)); + lhs = peek_and_parse_type(input, lhs)?; continue; } if input.peek(Token![+]) { @@ -329,6 +341,7 @@ fn parse_parser_expr(input: ParseStream, min_bp: u32) -> Result { } input.parse::()?; lhs = ParserExpr::Plus(Box::new(lhs)); + lhs = peek_and_parse_type(input, lhs)?; continue; } if input.peek(Token![?]) { @@ -363,6 +376,7 @@ mod test { #[test] fn test_parser_expr() { syn::parse_str::(r#"A? b c* D+ F?"#).unwrap(); + syn::parse_str::(r#"A? b c* (key value*:Vec<_>)+:HashMap<_, _> F?"#).unwrap(); } #[test] diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index 764d18c..6a19f75 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -22,7 +22,6 @@ use syn::Ident; #[cfg(feature = "debug")] use crate::debug::{styled, styled_write}; - use self::semact::SemAct; #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] diff --git a/pag-parser2/src/nf/semact.rs b/pag-parser2/src/nf/semact.rs index c4b1e3e..69fa6d2 100644 --- a/pag-parser2/src/nf/semact.rs +++ b/pag-parser2/src/nf/semact.rs @@ -6,10 +6,6 @@ // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. - - - - use crate::frontend::{CustomizedBlock, ParserExpr}; /// diff --git a/pag-parser2/src/nf/translation.rs b/pag-parser2/src/nf/translation.rs index 5ef09ce..f93cf18 100644 --- a/pag-parser2/src/nf/translation.rs +++ b/pag-parser2/src/nf/translation.rs @@ -7,7 +7,7 @@ use std::collections::HashMap; use quote::format_ident; use syn::{Ident, Type}; -use crate::frontend::{ParserDef}; +use crate::frontend::ParserDef; use super::{semact::SemAct, NormalForm, Tag}; @@ -37,7 +37,7 @@ impl Translation { self.anonymous_cnt += 1; result } - // Translate a top-level definitioin + // Translate a top-level definition fn add_toplevel_def(&mut self, name: Ident, def: &ParserDef) { let tag = Tag::Toplevel(name); self.annotations.insert(tag.clone(), def.ty.clone()); From a76362ad5bd00874388ade21a950967a8ccfe391 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Fri, 28 Jul 2023 00:36:55 -0400 Subject: [PATCH 31/42] stage work for translation --- pag-parser2/src/nf/translation.rs | 54 +++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/pag-parser2/src/nf/translation.rs b/pag-parser2/src/nf/translation.rs index f93cf18..0dccb81 100644 --- a/pag-parser2/src/nf/translation.rs +++ b/pag-parser2/src/nf/translation.rs @@ -7,9 +7,9 @@ use std::collections::HashMap; use quote::format_ident; use syn::{Ident, Type}; -use crate::frontend::ParserDef; +use crate::frontend::{ParserDef, ParserExpr}; -use super::{semact::SemAct, NormalForm, Tag}; +use super::{semact::SemAct, NormalForm, Tag, Action}; struct Translation { /// Table of semi-normalized production rules @@ -37,6 +37,56 @@ impl Translation { self.anonymous_cnt += 1; result } + fn construct_actions_from_expr_sequence<'a, I>(&mut self, stream: I) -> Vec + where + I: Iterator)>, + { + stream.map(|(expr, output )| { + match expr { + ParserExpr::ParserRef(rule) => + Action::Shift { tag: Tag::Toplevel(rule.clone()), output: output.or_else(|| Some(self.new_output_sym())) }, + ParserExpr::LexerRef(_) => + Action::Shift { tag: self.add_anonymous_rule(expr), output }, + ParserExpr::Ignore(inner) => + Action::Shift { tag: self.add_anonymous_rule(inner), output: None }, + ParserExpr::Hinted( inner, ty) => { + let tag = self.add_anonymous_rule(inner); + self.hints.insert(tag.clone(), ty.clone()); + Action::Shift { tag, output: None } + } + _ => + Action::Shift { tag: self.add_anonymous_rule(expr), output: output.or_else(|| Some(self.new_output_sym())) }, + } + }).collect() + } + fn construct_nf_from_expr_sequence<'a, I>(&mut self, mut stream: I, semact: SemAct) -> NormalForm + where + I: Iterator)>, + { + let head = stream.next(); + match head { + None => NormalForm::Empty(vec![], semact), + // Token rule is ignored on default, but can be used to specify the label. + Some((ParserExpr::LexerRef(token), label)) => { + let actions = self.construct_actions_from_expr_sequence(stream); + NormalForm::Sequence(token.clone(), label, actions, semact) + } + Some(_) => { + let recovered = head.into_iter().chain(stream); + let actions = self.construct_actions_from_expr_sequence(recovered); + NormalForm::Unexpanded(actions, semact) + } + } + } + + fn add_anonymous_rule(&mut self, expr: &ParserExpr) -> Tag { + // Must be primitive rules + + let tag = self.new_anonymous_tag(); + let semact = SemAct::infer(expr); + + } + // Translate a top-level definition fn add_toplevel_def(&mut self, name: Ident, def: &ParserDef) { let tag = Tag::Toplevel(name); From 3e767f08a5237a5b8f1b450e8557a8415a6cb882 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Sat, 29 Jul 2023 14:54:04 -0400 Subject: [PATCH 32/42] add markers for tailcall --- pag-parser2/src/frontend/ast.rs | 10 +- pag-parser2/src/lib.rs | 1 + pag-parser2/src/nf/inference.rs | 4 + pag-parser2/src/nf/mod.rs | 4 + pag-parser2/src/nf/semact.rs | 3 + pag-parser2/src/nf/translation.rs | 314 +++++++++++++++++++++++++----- 6 files changed, 286 insertions(+), 50 deletions(-) diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index 9912d84..9877b44 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -93,21 +93,21 @@ pub enum ParserExpr { } pub struct RightDeepIterator<'a> { - seq: Option<&'a LexerExpr>, + seq: Option<&'a ParserExpr>, } -impl<'a> From<&'a LexerExpr> for RightDeepIterator<'a> { - fn from(expr: &'a LexerExpr) -> Self { +impl<'a> From<&'a ParserExpr> for RightDeepIterator<'a> { + fn from(expr: &'a ParserExpr) -> Self { Self { seq: Some(expr) } } } impl<'a> Iterator for RightDeepIterator<'a> { - type Item = &'a LexerExpr; + type Item = &'a ParserExpr; fn next(&mut self) -> Option { match self.seq { - Some(LexerExpr::Seq(a, b)) => { + Some(ParserExpr::Seq(a, b)) => { self.seq = Some(b); Some(a) } diff --git a/pag-parser2/src/lib.rs b/pag-parser2/src/lib.rs index 85c0851..932e93c 100644 --- a/pag-parser2/src/lib.rs +++ b/pag-parser2/src/lib.rs @@ -6,6 +6,7 @@ // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. +#![feature(box_patterns)] #[cfg(feature = "debug")] mod debug; mod frontend; diff --git a/pag-parser2/src/nf/inference.rs b/pag-parser2/src/nf/inference.rs index a474b3d..b614faa 100644 --- a/pag-parser2/src/nf/inference.rs +++ b/pag-parser2/src/nf/inference.rs @@ -163,6 +163,10 @@ impl<'a> InferenceContext<'a> { for i in nfs.iter() { let semact = i.semact(); match semact { + SemAct::Recognize => { + inferred.replace(InferredType::Concrete(parse_quote!(()))); + break; + } // Token semantic action, the type is Span SemAct::Token => { inferred.replace(InferredType::Concrete(parse_quote!( diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index 6a19f75..f314d0f 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -74,6 +74,8 @@ pub enum Action { tag: Tag, output: Option, }, + /// Specialized action for tail call optimization. + TailCall } #[cfg(feature = "debug")] @@ -94,6 +96,7 @@ impl std::fmt::Display for Action { styled_write!(f, Color::Red, "{tag}") } } + Self::TailCall => styled_write!(f, Color::Green, "↻"), } } } @@ -140,6 +143,7 @@ impl NormalForm { } break; } + Action::TailCall => continue, } } if let Self::Sequence(_, Some(tk), _, _) = self { diff --git a/pag-parser2/src/nf/semact.rs b/pag-parser2/src/nf/semact.rs index 69fa6d2..9dec982 100644 --- a/pag-parser2/src/nf/semact.rs +++ b/pag-parser2/src/nf/semact.rs @@ -37,6 +37,8 @@ pub enum SemAct { OneOrMoreNested, /// Yield a token span, Token, + /// Recognize without generate any data. + Recognize, } impl SemAct { @@ -62,6 +64,7 @@ impl std::fmt::Display for SemAct { SemAct::OneOrMoreToplevel => write!(f, "OneOrMoreToplevel"), SemAct::OneOrMoreNested => write!(f, "OneOrMoreNested"), SemAct::Token => write!(f, "Token"), + SemAct::Recognize => write!(f, "Recognize"), } } } diff --git a/pag-parser2/src/nf/translation.rs b/pag-parser2/src/nf/translation.rs index 0dccb81..17d22bf 100644 --- a/pag-parser2/src/nf/translation.rs +++ b/pag-parser2/src/nf/translation.rs @@ -7,10 +7,10 @@ use std::collections::HashMap; use quote::format_ident; use syn::{Ident, Type}; +use super::{semact::SemAct, Action, NormalForm, Tag}; +use crate::frontend::RightDeepIterator; use crate::frontend::{ParserDef, ParserExpr}; -use super::{semact::SemAct, NormalForm, Tag, Action}; - struct Translation { /// Table of semi-normalized production rules semi_nfs: HashMap>, @@ -22,9 +22,18 @@ struct Translation { output_cnt: usize, /// Counter of assigned anonymous routines anonymous_cnt: usize, + /// Whether we are currently ignoring the output + ignoring: bool, } impl Translation { + fn start_ignoring(&mut self) { + self.ignoring = true; + } + fn end_ignore(&mut self) { + self.ignoring = false; + } + // Allocate a new symbol for unamed variable bindings. fn new_output_sym(&mut self) -> Ident { let result = format_ident!("_{}", self.output_cnt); @@ -37,54 +46,269 @@ impl Translation { self.anonymous_cnt += 1; result } - fn construct_actions_from_expr_sequence<'a, I>(&mut self, stream: I) -> Vec - where - I: Iterator)>, - { - stream.map(|(expr, output )| { - match expr { - ParserExpr::ParserRef(rule) => - Action::Shift { tag: Tag::Toplevel(rule.clone()), output: output.or_else(|| Some(self.new_output_sym())) }, - ParserExpr::LexerRef(_) => - Action::Shift { tag: self.add_anonymous_rule(expr), output }, - ParserExpr::Ignore(inner) => - Action::Shift { tag: self.add_anonymous_rule(inner), output: None }, - ParserExpr::Hinted( inner, ty) => { - let tag = self.add_anonymous_rule(inner); - self.hints.insert(tag.clone(), ty.clone()); - Action::Shift { tag, output: None } - } - _ => - Action::Shift { tag: self.add_anonymous_rule(expr), output: output.or_else(|| Some(self.new_output_sym())) }, - } - }).collect() + fn add_nf(&mut self, tag: Tag, nf: NormalForm) { + self.semi_nfs.entry(tag).or_default().push(nf); } - fn construct_nf_from_expr_sequence<'a, I>(&mut self, mut stream: I, semact: SemAct) -> NormalForm - where - I: Iterator)>, - { - let head = stream.next(); - match head { - None => NormalForm::Empty(vec![], semact), - // Token rule is ignored on default, but can be used to specify the label. - Some((ParserExpr::LexerRef(token), label)) => { - let actions = self.construct_actions_from_expr_sequence(stream); - NormalForm::Sequence(token.clone(), label, actions, semact) + fn add_nf_from_anonymous_expr(&mut self, expr: &ParserExpr, tag: &Tag) { + match expr { + ParserExpr::Seq(box ParserExpr::Ignore(box ParserExpr::LexerRef(head)), tail) => { + let tail_actions = RightDeepIterator::from(tail.as_ref()) + .map(|inner| self.add_anonymous_rule(inner)) + .map(|(tag, output)| Action::Shift { tag, output }) + .collect(); + self.add_nf( + tag.clone(), + NormalForm::Sequence( + head.clone(), + None, + tail_actions, + if self.ignoring { + SemAct::Recognize + } else { + SemAct::Gather + }, + ), + ); + } + ParserExpr::Seq(box ParserExpr::LexerRef(head), tail) => { + let tail_actions = RightDeepIterator::from(tail.as_ref()) + .map(|inner| self.add_anonymous_rule(inner)) + .map(|(tag, output)| Action::Shift { tag, output }) + .collect(); + let nf = NormalForm::Sequence( + head.clone(), + if self.ignoring { + None + } else { + Some(self.new_output_sym()) + }, + tail_actions, + if self.ignoring { + SemAct::Recognize + } else { + SemAct::Gather + }, + ); + self.add_nf(tag.clone(), nf); + } + ParserExpr::Seq(_, _) => { + let actions = RightDeepIterator::from(expr) + .map(|inner| self.add_anonymous_rule(inner)) + .map(|(tag, output)| Action::Shift { tag, output }) + .collect(); + let nf = NormalForm::Unexpanded( + actions, + if self.ignoring { + SemAct::Recognize + } else { + SemAct::Gather + }, + ); + self.add_nf(tag.clone(), nf); + } + ParserExpr::Opt(box ParserExpr::Seq( + box ParserExpr::Ignore(box ParserExpr::LexerRef(head)), + tail, + )) => { + let tail_actions = RightDeepIterator::from(tail.as_ref()) + .map(|inner| self.add_anonymous_rule(inner)) + .map(|(tag, output)| Action::Shift { tag, output }) + .collect(); + self.add_nf( + tag.clone(), + NormalForm::Sequence( + head.clone(), + None, + tail_actions, + if self.ignoring { + SemAct::Recognize + } else { + SemAct::Option + }, + ), + ); + self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::Option)); + } + ParserExpr::Opt(box ParserExpr::Seq(box ParserExpr::LexerRef(head), tail)) => { + let tail_actions = RightDeepIterator::from(tail.as_ref()) + .map(|inner| self.add_anonymous_rule(inner)) + .map(|(tag, output)| Action::Shift { tag, output }) + .collect(); + let nf = NormalForm::Sequence( + head.clone(), + if self.ignoring { + None + } else { + Some(self.new_output_sym()) + }, + tail_actions, + if self.ignoring { + SemAct::Recognize + } else { + SemAct::Option + }, + ); + self.add_nf(tag.clone(), nf); + self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::Option)); + } + ParserExpr::Opt(inner) => { + let actions = RightDeepIterator::from(inner.as_ref()) + .map(|inner| self.add_anonymous_rule(inner)) + .map(|(tag, output)| Action::Shift { tag, output }) + .collect(); + let nf = NormalForm::Unexpanded( + actions, + if self.ignoring { + SemAct::Recognize + } else { + SemAct::Option + }, + ); + self.add_nf(tag.clone(), nf); + self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::Option)); } - Some(_) => { - let recovered = head.into_iter().chain(stream); - let actions = self.construct_actions_from_expr_sequence(recovered); - NormalForm::Unexpanded(actions, semact) + ParserExpr::Star(box ParserExpr::Seq( + box ParserExpr::Ignore(box ParserExpr::LexerRef(head)), + tail, + )) => { + let tail_actions = RightDeepIterator::from(tail.as_ref()) + .map(|inner| self.add_anonymous_rule(inner)) + .map(|(tag, output)| Action::Shift { tag, output }) + .collect(); + self.add_nf( + tag.clone(), + NormalForm::Sequence( + head.clone(), + None, + tail_actions, + if self.ignoring { + SemAct::Recognize + } else { + SemAct::ZeroOrMore + }, + ), + ); + self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::ZeroOrMore)); } + ParserExpr::Star(box ParserExpr::Seq(box ParserExpr::LexerRef(head), tail)) => { + let tail_actions = RightDeepIterator::from(tail.as_ref()) + .map(|inner| self.add_anonymous_rule(inner)) + .map(|(tag, output)| Action::Shift { tag, output }) + .collect(); + let nf = NormalForm::Sequence( + head.clone(), + if self.ignoring { + None + } else { + Some(self.new_output_sym()) + }, + tail_actions, + if self.ignoring { + SemAct::Recognize + } else { + SemAct::ZeroOrMore + }, + ); + self.add_nf(tag.clone(), nf); + self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::ZeroOrMore)); + } + ParserExpr::Star(inner) => { + let actions = RightDeepIterator::from(inner.as_ref()) + .map(|inner| self.add_anonymous_rule(inner)) + .map(|(tag, output)| Action::Shift { tag, output }) + .collect(); + let nf = NormalForm::Unexpanded( + actions, + if self.ignoring { + SemAct::Recognize + } else { + SemAct::ZeroOrMore + }, + ); + self.add_nf(tag.clone(), nf); + self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::ZeroOrMore)); + } + ParserExpr::Plus(box ParserExpr::Seq( + box ParserExpr::Ignore(box ParserExpr::LexerRef(head)), + tail, + )) => { + let tail_actions = RightDeepIterator::from(tail.as_ref()) + .map(|inner| self.add_anonymous_rule(inner)) + .map(|(tag, output)| Action::Shift { tag, output }) + .collect(); + self.add_nf( + tag.clone(), + NormalForm::Sequence( + head.clone(), + None, + tail_actions, + if self.ignoring { + SemAct::Recognize + } else { + SemAct::OneOrMoreToplevel + }, + ), + ); + self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::ZeroOrMore)); + } + ParserExpr::Plus(box ParserExpr::Seq(box ParserExpr::LexerRef(head), tail)) => { + let tail_actions = RightDeepIterator::from(tail.as_ref()) + .map(|inner| self.add_anonymous_rule(inner)) + .map(|(tag, output)| Action::Shift { tag, output }) + .collect(); + let nf = NormalForm::Sequence( + head.clone(), + if self.ignoring { + None + } else { + Some(self.new_output_sym()) + }, + tail_actions, + if self.ignoring { + SemAct::Recognize + } else { + SemAct::ZeroOrMore + }, + ); + self.add_nf(tag.clone(), nf); + self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::ZeroOrMore)); + } + ParserExpr::Plus(inner) => { + let actions = RightDeepIterator::from(inner.as_ref()) + .map(|inner| self.add_anonymous_rule(inner)) + .map(|(tag, output)| Action::Shift { tag, output }) + .collect(); + let nf = NormalForm::Unexpanded( + actions, + if self.ignoring { + SemAct::Recognize + } else { + SemAct::ZeroOrMore + }, + ); + self.add_nf(tag.clone(), nf); + self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::ZeroOrMore)); + } + ParserExpr::LexerRef(ident) => { + let nf = if self.ignoring { + NormalForm::Sequence(ident.clone(), None, vec![], SemAct::Recognize) + } else { + NormalForm::Sequence( + ident.clone(), + Some(self.new_output_sym()), + vec![], + SemAct::Token, + ) + }; + self.add_nf(tag.clone(), nf); + } + ParserExpr::ParserRef(_) => unreachable!("cannot create nf from parser ref"), + ParserExpr::Ignore(_) => unreachable!("cannot create nf from ignore"), + ParserExpr::Hinted(_, _) => unreachable!("cannot create nf from hinted"), } } - - fn add_anonymous_rule(&mut self, expr: &ParserExpr) -> Tag { - // Must be primitive rules - - let tag = self.new_anonymous_tag(); - let semact = SemAct::infer(expr); - + fn add_anonymous_rule(&mut self, expr: &ParserExpr) -> (Tag, Option) { + todo!() } // Translate a top-level definition From a9a61ca7abe1513e25aa83e3d1cfbbd2bfb207fc Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Sat, 29 Jul 2023 23:07:11 -0400 Subject: [PATCH 33/42] implement translation --- pag-parser2/src/frontend/parse.rs | 12 +- pag-parser2/src/nf/mod.rs | 50 +++- pag-parser2/src/nf/translation.rs | 444 ++++++++++++++---------------- 3 files changed, 263 insertions(+), 243 deletions(-) diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index e70a51b..32bba36 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -140,7 +140,7 @@ impl Parse for ParserRule { impl Parse for VarBinding { // ParserExpr ("[" syn::Ident (":" syn::Type)? "]")? fn parse(input: ParseStream) -> Result { - let mut expr = input.parse::()?; + let mut expr = parse_parser_expr(input, 0, true)?; let mut name = None; let mut ty = None; @@ -276,12 +276,12 @@ fn parse_lexer_expr(input: ParseStream, min_bp: u32) -> Result { impl Parse for ParserExpr { fn parse(input: ParseStream) -> Result { - parse_parser_expr(input, 0) + parse_parser_expr(input, 0, false) } } // pratt parsing -fn parse_parser_expr(input: ParseStream, min_bp: u32) -> Result { +fn parse_parser_expr(input: ParseStream, min_bp: u32, is_toplevel: bool) -> Result { let mut lhs = 'lhs: { if input.peek(syn::Ident) { let ident = input.parse::()?.unraw(); @@ -299,19 +299,19 @@ fn parse_parser_expr(input: ParseStream, min_bp: u32) -> Result { if input.peek(Token![#]) { input.parse::()?; let r_bp = 60; - let rhs = parse_parser_expr(input, r_bp)?; + let rhs = parse_parser_expr(input, r_bp, is_toplevel)?; break 'lhs ParserExpr::Ignore(Box::new(rhs)); } return Err(input.error("expected parser expression")); }; loop { - if input.peek(syn::Ident) || input.peek(syn::token::Paren) || input.peek(Token![#]) { + if !is_toplevel && (input.peek(syn::Ident) || input.peek(syn::token::Paren) || input.peek(Token![#])) { let (l_bp, r_bp) = (40, 41); if l_bp < min_bp { break; } - let rhs = parse_parser_expr(input, r_bp)?; + let rhs = parse_parser_expr(input, r_bp, is_toplevel)?; lhs = ParserExpr::Seq(Box::new(lhs), Box::new(rhs)); continue; } diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index f314d0f..f1e1ceb 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -13,7 +13,7 @@ mod translation; use std::{ collections::{HashMap, VecDeque}, - ops::Deref, + ops::{Deref, DerefMut}, }; use quote::format_ident; @@ -51,7 +51,7 @@ impl std::fmt::Display for Tag { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Tag::Toplevel(ident) => write!(f, "{ident}"), - Tag::Anonymous(index) => styled_write!(f, Style::new().italic(), "_{index}"), + Tag::Anonymous(index) => styled_write!(f, Style::new().italic(), "_A{index}"), } } } @@ -75,7 +75,9 @@ pub enum Action { output: Option, }, /// Specialized action for tail call optimization. - TailCall + TailCall, + /// Specialized action for passing collector to subroutines. + PassCollector(Tag), } #[cfg(feature = "debug")] @@ -97,6 +99,7 @@ impl std::fmt::Display for Action { } } Self::TailCall => styled_write!(f, Color::Green, "↻"), + Self::PassCollector(tag) => styled_write!(f, Color::Green, "⇒{tag}"), } } } @@ -121,6 +124,39 @@ impl NormalForm { | Self::Sequence(_, _, _, semact) => semact, } } + pub fn semact_mut(&mut self) -> &mut SemAct { + match self { + Self::Empty(_, semact) + | Self::Unexpanded(_, semact) + | Self::Sequence(_, _, _, semact) => semact, + } + } + pub fn append_tailcall(&mut self) { + match self { + Self::Empty(_actions, _) => { + unreachable!("empty cannot be tail called, otherwise there will be ambiguity") + } + Self::Unexpanded(actions, _) => { + actions.push(Action::TailCall); + } + Self::Sequence(_, _, actions, _) => { + actions.push(Action::TailCall); + } + } + } + pub fn append_pass_collector(&mut self, tag: Tag) { + match self { + Self::Empty(_actions, _) => { + unreachable!("empty cannot be followed by another subroutine, otherwise there will be ambiguity") + } + Self::Unexpanded(actions, _) => { + actions.push(Action::PassCollector(tag)); + } + Self::Sequence(_, _, actions, _) => { + actions.push(Action::PassCollector(tag)); + } + } + } pub fn visible_bindings(&self, skip: usize) -> Vec<(&Ident, BoundTarget)> { match self { Self::Empty(actions, _) => actions @@ -143,6 +179,7 @@ impl NormalForm { } break; } + Action::PassCollector(..) => continue, Action::TailCall => continue, } } @@ -224,6 +261,7 @@ fn debug_print_test() { println!("{}", sequence); } +#[derive(Default, Clone)] /// Well, it is not the notorius firewall. pub struct NFTable(HashMap>); @@ -235,6 +273,12 @@ impl Deref for NFTable { } } +impl DerefMut for NFTable { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + #[cfg(feature = "debug")] impl std::fmt::Display for NFTable { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { diff --git a/pag-parser2/src/nf/translation.rs b/pag-parser2/src/nf/translation.rs index 17d22bf..90a8823 100644 --- a/pag-parser2/src/nf/translation.rs +++ b/pag-parser2/src/nf/translation.rs @@ -7,13 +7,15 @@ use std::collections::HashMap; use quote::format_ident; use syn::{Ident, Type}; +use super::NFTable; use super::{semact::SemAct, Action, NormalForm, Tag}; use crate::frontend::RightDeepIterator; use crate::frontend::{ParserDef, ParserExpr}; +#[derive(Default)] struct Translation { /// Table of semi-normalized production rules - semi_nfs: HashMap>, + semi_nfs: NFTable, /// Toplevel type annotations annotations: HashMap, /// Type hints when calling inner routines (collector) @@ -23,274 +25,162 @@ struct Translation { /// Counter of assigned anonymous routines anonymous_cnt: usize, /// Whether we are currently ignoring the output - ignoring: bool, + ignoring_cnt: usize, } impl Translation { + /// Enter ignoring mode fn start_ignoring(&mut self) { - self.ignoring = true; + self.ignoring_cnt += 1; } - fn end_ignore(&mut self) { - self.ignoring = false; + /// Exit ignoring mode + fn end_ignoring(&mut self) { + self.ignoring_cnt -= 1; } - // Allocate a new symbol for unamed variable bindings. + fn ignoring(&mut self) -> bool { + self.ignoring_cnt > 0 + } + + /// Allocate a new symbol for unamed variable bindings. fn new_output_sym(&mut self) -> Ident { let result = format_ident!("_{}", self.output_cnt); self.output_cnt += 1; result } - // Allocate a new tag for anonymous routines. + /// Allocate a new tag for anonymous routines. fn new_anonymous_tag(&mut self) -> Tag { let result = Tag::Anonymous(self.anonymous_cnt); self.anonymous_cnt += 1; result } - fn add_nf(&mut self, tag: Tag, nf: NormalForm) { - self.semi_nfs.entry(tag).or_default().push(nf); - } - fn add_nf_from_anonymous_expr(&mut self, expr: &ParserExpr, tag: &Tag) { - match expr { - ParserExpr::Seq(box ParserExpr::Ignore(box ParserExpr::LexerRef(head)), tail) => { - let tail_actions = RightDeepIterator::from(tail.as_ref()) - .map(|inner| self.add_anonymous_rule(inner)) + /// Construct a normal form from a sequence of parser expressions. The semact is always `Recognize`. + fn partial_nf_from_sequence< + 'a, + const IGNORE_UNNAMED: bool, + I: Iterator)>, + >( + &mut self, + mut iter: I, + ) -> NormalForm { + match iter.next() { + None => NormalForm::Empty(vec![], SemAct::Recognize), + Some((ParserExpr::Ignore(box ParserExpr::LexerRef(token)), _)) => { + let tail = iter + .map(|(inner, named)| self.add_anonymous_rule::(inner, named)) .map(|(tag, output)| Action::Shift { tag, output }) .collect(); - self.add_nf( - tag.clone(), - NormalForm::Sequence( - head.clone(), - None, - tail_actions, - if self.ignoring { - SemAct::Recognize - } else { - SemAct::Gather - }, - ), - ); + NormalForm::Sequence(token.clone(), None, tail, SemAct::Recognize) } - ParserExpr::Seq(box ParserExpr::LexerRef(head), tail) => { - let tail_actions = RightDeepIterator::from(tail.as_ref()) - .map(|inner| self.add_anonymous_rule(inner)) + Some((ParserExpr::LexerRef(token), named)) => { + let tail = iter + .map(|(inner, named)| self.add_anonymous_rule::(inner, named)) .map(|(tag, output)| Action::Shift { tag, output }) .collect(); - let nf = NormalForm::Sequence( - head.clone(), - if self.ignoring { + NormalForm::Sequence( + token.clone(), + if self.ignoring() { None + } else if IGNORE_UNNAMED { + named } else { - Some(self.new_output_sym()) - }, - tail_actions, - if self.ignoring { - SemAct::Recognize - } else { - SemAct::Gather + named.or_else(|| Some(self.new_output_sym())) }, - ); - self.add_nf(tag.clone(), nf); + tail, + SemAct::Recognize, + ) } - ParserExpr::Seq(_, _) => { - let actions = RightDeepIterator::from(expr) - .map(|inner| self.add_anonymous_rule(inner)) + Some((expr, named)) => { + let sequence = [(expr, named)] + .into_iter() + .chain(iter) + .map(|(inner, named)| self.add_anonymous_rule::(inner, named)) .map(|(tag, output)| Action::Shift { tag, output }) .collect(); - let nf = NormalForm::Unexpanded( - actions, - if self.ignoring { - SemAct::Recognize - } else { - SemAct::Gather - }, - ); - self.add_nf(tag.clone(), nf); - } - ParserExpr::Opt(box ParserExpr::Seq( - box ParserExpr::Ignore(box ParserExpr::LexerRef(head)), - tail, - )) => { - let tail_actions = RightDeepIterator::from(tail.as_ref()) - .map(|inner| self.add_anonymous_rule(inner)) - .map(|(tag, output)| Action::Shift { tag, output }) - .collect(); - self.add_nf( - tag.clone(), - NormalForm::Sequence( - head.clone(), - None, - tail_actions, - if self.ignoring { - SemAct::Recognize - } else { - SemAct::Option - }, - ), - ); - self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::Option)); + NormalForm::Unexpanded(sequence, SemAct::Recognize) } - ParserExpr::Opt(box ParserExpr::Seq(box ParserExpr::LexerRef(head), tail)) => { - let tail_actions = RightDeepIterator::from(tail.as_ref()) - .map(|inner| self.add_anonymous_rule(inner)) - .map(|(tag, output)| Action::Shift { tag, output }) - .collect(); - let nf = NormalForm::Sequence( - head.clone(), - if self.ignoring { - None - } else { - Some(self.new_output_sym()) - }, - tail_actions, - if self.ignoring { - SemAct::Recognize - } else { - SemAct::Option - }, + } + } + fn add_nf(&mut self, tag: Tag, nf: NormalForm) { + self.semi_nfs.entry(tag).or_default().push(nf); + } + fn add_nf_from_anonymous_expr(&mut self, expr: &ParserExpr, tag: &Tag) { + match expr { + ParserExpr::Seq(..) => { + let mut partial_nf = self.partial_nf_from_sequence::( + RightDeepIterator::from(expr).map(|expr| (expr, None)), ); - self.add_nf(tag.clone(), nf); - self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::Option)); + *partial_nf.semact_mut() = if self.ignoring() { + SemAct::Recognize + } else { + SemAct::Gather + }; + self.add_nf(tag.clone(), partial_nf); } ParserExpr::Opt(inner) => { - let actions = RightDeepIterator::from(inner.as_ref()) - .map(|inner| self.add_anonymous_rule(inner)) - .map(|(tag, output)| Action::Shift { tag, output }) - .collect(); - let nf = NormalForm::Unexpanded( - actions, - if self.ignoring { - SemAct::Recognize - } else { - SemAct::Option - }, - ); - self.add_nf(tag.clone(), nf); - self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::Option)); - } - ParserExpr::Star(box ParserExpr::Seq( - box ParserExpr::Ignore(box ParserExpr::LexerRef(head)), - tail, - )) => { - let tail_actions = RightDeepIterator::from(tail.as_ref()) - .map(|inner| self.add_anonymous_rule(inner)) - .map(|(tag, output)| Action::Shift { tag, output }) - .collect(); - self.add_nf( - tag.clone(), - NormalForm::Sequence( - head.clone(), - None, - tail_actions, - if self.ignoring { - SemAct::Recognize - } else { - SemAct::ZeroOrMore - }, - ), + let mut partial_nf = self.partial_nf_from_sequence::( + RightDeepIterator::from(inner.as_ref()).map(|expr| (expr, None)), ); - self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::ZeroOrMore)); - } - ParserExpr::Star(box ParserExpr::Seq(box ParserExpr::LexerRef(head), tail)) => { - let tail_actions = RightDeepIterator::from(tail.as_ref()) - .map(|inner| self.add_anonymous_rule(inner)) - .map(|(tag, output)| Action::Shift { tag, output }) - .collect(); - let nf = NormalForm::Sequence( - head.clone(), - if self.ignoring { - None - } else { - Some(self.new_output_sym()) - }, - tail_actions, - if self.ignoring { - SemAct::Recognize - } else { - SemAct::ZeroOrMore - }, - ); - self.add_nf(tag.clone(), nf); - self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::ZeroOrMore)); + let semact = if self.ignoring() { + SemAct::Recognize + } else { + SemAct::Option + }; + *partial_nf.semact_mut() = semact.clone(); + self.add_nf(tag.clone(), partial_nf); + // add one more rule for empty + self.add_nf(tag.clone(), NormalForm::Empty(vec![], semact)); } ParserExpr::Star(inner) => { - let actions = RightDeepIterator::from(inner.as_ref()) - .map(|inner| self.add_anonymous_rule(inner)) - .map(|(tag, output)| Action::Shift { tag, output }) - .collect(); - let nf = NormalForm::Unexpanded( - actions, - if self.ignoring { - SemAct::Recognize - } else { - SemAct::ZeroOrMore - }, + let mut partial_nf = self.partial_nf_from_sequence::( + RightDeepIterator::from(inner.as_ref()).map(|expr| (expr, None)), ); - self.add_nf(tag.clone(), nf); - self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::ZeroOrMore)); + let semact = if self.ignoring() { + SemAct::Recognize + } else { + SemAct::ZeroOrMore + }; + *partial_nf.semact_mut() = semact.clone(); + self.add_nf(tag.clone(), partial_nf); + // add one more rule for empty + self.add_nf(tag.clone(), NormalForm::Empty(vec![], semact)); } - ParserExpr::Plus(box ParserExpr::Seq( - box ParserExpr::Ignore(box ParserExpr::LexerRef(head)), - tail, - )) => { - let tail_actions = RightDeepIterator::from(tail.as_ref()) - .map(|inner| self.add_anonymous_rule(inner)) - .map(|(tag, output)| Action::Shift { tag, output }) - .collect(); - self.add_nf( - tag.clone(), - NormalForm::Sequence( - head.clone(), - None, - tail_actions, - if self.ignoring { - SemAct::Recognize - } else { - SemAct::OneOrMoreToplevel - }, - ), + ParserExpr::Plus(inner) => { + let mut partial_nf = self.partial_nf_from_sequence::( + RightDeepIterator::from(inner.as_ref()).map(|expr| (expr, None)), ); - self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::ZeroOrMore)); - } - ParserExpr::Plus(box ParserExpr::Seq(box ParserExpr::LexerRef(head), tail)) => { - let tail_actions = RightDeepIterator::from(tail.as_ref()) - .map(|inner| self.add_anonymous_rule(inner)) - .map(|(tag, output)| Action::Shift { tag, output }) - .collect(); - let nf = NormalForm::Sequence( - head.clone(), - if self.ignoring { - None - } else { - Some(self.new_output_sym()) - }, - tail_actions, - if self.ignoring { + let nested_tag = self.new_anonymous_tag(); + // the nested routine + { + let semact = if self.ignoring() { SemAct::Recognize } else { - SemAct::ZeroOrMore - }, - ); - self.add_nf(tag.clone(), nf); - self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::ZeroOrMore)); - } - ParserExpr::Plus(inner) => { - let actions = RightDeepIterator::from(inner.as_ref()) - .map(|inner| self.add_anonymous_rule(inner)) - .map(|(tag, output)| Action::Shift { tag, output }) - .collect(); - let nf = NormalForm::Unexpanded( - actions, - if self.ignoring { + SemAct::OneOrMoreNested + }; + + self.add_nf(nested_tag.clone(), { + let mut nf = partial_nf.clone(); + nf.append_tailcall(); + *nf.semact_mut() = semact.clone(); + nf + }); + + self.add_nf(nested_tag.clone(), NormalForm::Empty(vec![], semact)); + } + // the toplevel routine + { + let semact = if self.ignoring() { SemAct::Recognize } else { - SemAct::ZeroOrMore - }, - ); - self.add_nf(tag.clone(), nf); - self.add_nf(tag.clone(), NormalForm::Empty(vec![], SemAct::ZeroOrMore)); + SemAct::OneOrMoreToplevel + }; + partial_nf.append_pass_collector(nested_tag); + *partial_nf.semact_mut() = semact; + self.add_nf(tag.clone(), partial_nf); + } } ParserExpr::LexerRef(ident) => { - let nf = if self.ignoring { + let nf = if self.ignoring() { NormalForm::Sequence(ident.clone(), None, vec![], SemAct::Recognize) } else { NormalForm::Sequence( @@ -307,8 +197,49 @@ impl Translation { ParserExpr::Hinted(_, _) => unreachable!("cannot create nf from hinted"), } } - fn add_anonymous_rule(&mut self, expr: &ParserExpr) -> (Tag, Option) { - todo!() + fn add_anonymous_rule( + &mut self, + expr: &ParserExpr, + named: Option, + ) -> (Tag, Option) { + let is_unnamed = named.is_none(); + if IGNORE_UNNAMED && is_unnamed { + self.start_ignoring(); + } + let result = match expr { + ParserExpr::ParserRef(x) => { + let tag = Tag::Toplevel(x.clone()); + if self.ignoring() { + (tag, None) + } else { + (tag, named.or_else(|| Some(self.new_output_sym()))) + } + } + ParserExpr::Ignore(expr) => { + self.start_ignoring(); + let (tag, output) = self.add_anonymous_rule::(expr, named); + self.end_ignoring(); + (tag, output) + } + ParserExpr::Hinted(expr, hint) => { + let (tag, output) = self.add_anonymous_rule::(expr, named); + self.hints.insert(tag.clone(), hint.clone()); + (tag, output) + } + _ => { + let tag = self.new_anonymous_tag(); + self.add_nf_from_anonymous_expr(expr, &tag); + if self.ignoring() { + (tag, None) + } else { + (tag, named.or_else(|| Some(self.new_output_sym()))) + } + } + }; + if IGNORE_UNNAMED && is_unnamed { + self.end_ignoring(); + } + result } // Translate a top-level definition @@ -319,20 +250,65 @@ impl Translation { .rules .iter() .map(|rule| { - let semact = if let Some(x) = rule.action.clone() { - SemAct::CustomizedRoutine(x) + let semact = if let Some(action) = &rule.action { + SemAct::CustomizedRoutine(action.clone()) } else if rule.vars.len() == 1 { SemAct::infer(&rule.vars[0].expr) } else { SemAct::Gather }; - match semact { - SemAct::Gather | SemAct::CustomizedRoutine(_) => {} - _ => {} - } - todo!() + let mut partial_nf = if matches!(semact, SemAct::CustomizedRoutine(..)) { + self.partial_nf_from_sequence::( + rule.vars + .iter() + .map(|binding| (&binding.expr, binding.name.clone())), + ) + } else { + self.partial_nf_from_sequence::( + rule.vars + .iter() + .map(|binding| (&binding.expr, binding.name.clone())), + ) + }; + *partial_nf.semact_mut() = semact; + partial_nf }) .collect(); self.semi_nfs.insert(tag, rules); } } + +#[cfg(test)] +mod test { + use crate::frontend::Ast; + + use super::Translation; + + #[test] + fn sexpr() { + let ast = syn::parse_str::( + r#" + %entry = sexp; + + DIGIT = '0'..'9'; + ALPHA = 'a'..'z' | 'A'..'Z'; + LPAREN = "("; + RPAREN = ")"; + ATOM = ALPHA (ALPHA | DIGIT)*; + %skip = (" " | "\t" | "\n" | "\r")+; + + compound: SExp = LPAREN sexp+[sexp:Vec<_>] RPAREN { SExp::Compound(sexp) }; + atom : SExp = ATOM[atom] { SExp::Atom(atom) }; + sexp : SExp = compound + | atom; + "#, + ) + .unwrap(); + let mut translation = Translation::default(); + for (name, def) in ast.parser_map.iter() { + translation.add_toplevel_def(name.clone(), def); + } + #[cfg(feature = "debug")] + println!("{}", translation.semi_nfs); + } +} From c00774eb8abe567cd281a5fe9fd947c4dd34b7cf Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Sun, 30 Jul 2023 11:48:46 +0800 Subject: [PATCH 34/42] disallow # in toplevel --- pag-lexer/src/lookahead.rs | 2 +- pag-parser2/src/frontend/parse.rs | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pag-lexer/src/lookahead.rs b/pag-lexer/src/lookahead.rs index c1cad8c..1db0c39 100644 --- a/pag-lexer/src/lookahead.rs +++ b/pag-lexer/src/lookahead.rs @@ -157,7 +157,7 @@ impl LoopOptimizer { let table_size = self.global_lut.len(); let table = self.global_lut.iter().map(|x| quote!([#(#x,)*])); Some(quote! { - const GLOBAL_LUT : [[u8; 256]; #table_size] = [ #(#table,)* ]; + const GLOBAL_LUT: [[u8; 256]; #table_size] = [ #(#table,)* ]; }) } diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index 32bba36..685cc3a 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -140,7 +140,7 @@ impl Parse for ParserRule { impl Parse for VarBinding { // ParserExpr ("[" syn::Ident (":" syn::Type)? "]")? fn parse(input: ParseStream) -> Result { - let mut expr = parse_parser_expr(input, 0, true)?; + let mut expr = input.parse::()?; let mut name = None; let mut ty = None; @@ -276,7 +276,7 @@ fn parse_lexer_expr(input: ParseStream, min_bp: u32) -> Result { impl Parse for ParserExpr { fn parse(input: ParseStream) -> Result { - parse_parser_expr(input, 0, false) + parse_parser_expr(input, 0, true) } } @@ -294,24 +294,26 @@ fn parse_parser_expr(input: ParseStream, min_bp: u32, is_toplevel: bool) -> Resu if input.peek(syn::token::Paren) { let content; parenthesized!(content in input); - break 'lhs content.parse::()?; + break 'lhs parse_parser_expr(&content, 0, false)?; } - if input.peek(Token![#]) { + if !is_toplevel && input.peek(Token![#]) { input.parse::()?; let r_bp = 60; - let rhs = parse_parser_expr(input, r_bp, is_toplevel)?; + let rhs = parse_parser_expr(input, r_bp, false)?; break 'lhs ParserExpr::Ignore(Box::new(rhs)); } return Err(input.error("expected parser expression")); }; loop { - if !is_toplevel && (input.peek(syn::Ident) || input.peek(syn::token::Paren) || input.peek(Token![#])) { + if !is_toplevel + && (input.peek(syn::Ident) || input.peek(syn::token::Paren) || input.peek(Token![#])) + { let (l_bp, r_bp) = (40, 41); if l_bp < min_bp { break; } - let rhs = parse_parser_expr(input, r_bp, is_toplevel)?; + let rhs = parse_parser_expr(input, r_bp, false)?; lhs = ParserExpr::Seq(Box::new(lhs), Box::new(rhs)); continue; } @@ -375,8 +377,8 @@ mod test { #[test] fn test_parser_expr() { - syn::parse_str::(r#"A? b c* D+ F?"#).unwrap(); - syn::parse_str::(r#"A? b c* (key value*:Vec<_>)+:HashMap<_, _> F?"#).unwrap(); + syn::parse_str::(r#"(A? b c* D+ F?)"#).unwrap(); + syn::parse_str::(r#"(A? b c* (key value*:Vec<_>)+:HashMap<_, _> F?)"#).unwrap(); } #[test] From 86e93d3e1ad577cfa0cc638fb57a9899e6ebe94f Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Sun, 30 Jul 2023 12:35:16 +0800 Subject: [PATCH 35/42] remove `Hinted` --- pag-parser2/src/frontend/ast.rs | 2 +- pag-parser2/src/frontend/parse.rs | 22 +---- pag-parser2/src/nf/inference.rs | 137 ++++++++++++++---------------- pag-parser2/src/nf/mod.rs | 6 +- pag-parser2/src/nf/translation.rs | 12 +-- 5 files changed, 81 insertions(+), 98 deletions(-) diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index 9877b44..456f8e3 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -64,6 +64,7 @@ pub struct ParserRule { pub struct VarBinding { pub expr: ParserExpr, pub name: Option, + pub ty: Option } // TODO: how to express "bottom" & "any"? @@ -89,7 +90,6 @@ pub enum ParserExpr { LexerRef(syn::Ident), ParserRef(syn::Ident), Ignore(Box), - Hinted(Box, syn::Type), } pub struct RightDeepIterator<'a> { diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index 685cc3a..9f18550 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -100,7 +100,7 @@ impl Parse for ParserDef { fn parse(input: ParseStream) -> Result { let ty = match input.parse::() { Ok(_) => input.parse::()?, - Err(_) => parse_quote!(&'src str), + Err(_) => parse_quote!(::pag_util::Span<'src>), }; input.parse::()?; @@ -140,7 +140,7 @@ impl Parse for ParserRule { impl Parse for VarBinding { // ParserExpr ("[" syn::Ident (":" syn::Type)? "]")? fn parse(input: ParseStream) -> Result { - let mut expr = input.parse::()?; + let expr = input.parse::()?; let mut name = None; let mut ty = None; @@ -159,10 +159,8 @@ impl Parse for VarBinding { return Err(content.error("expected `]`")); } } - if let Some(ty) = ty { - expr = ParserExpr::Hinted(Box::new(expr), ty); - } - Ok(Self { expr, name }) + + Ok(Self { expr, name, ty }) } } @@ -317,15 +315,6 @@ fn parse_parser_expr(input: ParseStream, min_bp: u32, is_toplevel: bool) -> Resu lhs = ParserExpr::Seq(Box::new(lhs), Box::new(rhs)); continue; } - fn peek_and_parse_type(input: ParseStream, expr: ParserExpr) -> Result { - Ok(if input.peek(Token!(:)) { - input.parse::()?; - let ty = input.parse::()?; - ParserExpr::Hinted(Box::new(expr), ty) - } else { - expr - }) - } if input.peek(Token![*]) { let l_bp = 70; if l_bp < min_bp { @@ -333,7 +322,6 @@ fn parse_parser_expr(input: ParseStream, min_bp: u32, is_toplevel: bool) -> Resu } input.parse::()?; lhs = ParserExpr::Star(Box::new(lhs)); - lhs = peek_and_parse_type(input, lhs)?; continue; } if input.peek(Token![+]) { @@ -343,7 +331,6 @@ fn parse_parser_expr(input: ParseStream, min_bp: u32, is_toplevel: bool) -> Resu } input.parse::()?; lhs = ParserExpr::Plus(Box::new(lhs)); - lhs = peek_and_parse_type(input, lhs)?; continue; } if input.peek(Token![?]) { @@ -378,7 +365,6 @@ mod test { #[test] fn test_parser_expr() { syn::parse_str::(r#"(A? b c* D+ F?)"#).unwrap(); - syn::parse_str::(r#"(A? b c* (key value*:Vec<_>)+:HashMap<_, _> F?)"#).unwrap(); } #[test] diff --git a/pag-parser2/src/nf/inference.rs b/pag-parser2/src/nf/inference.rs index b614faa..ac75948 100644 --- a/pag-parser2/src/nf/inference.rs +++ b/pag-parser2/src/nf/inference.rs @@ -88,23 +88,23 @@ pub enum InferredType { pub struct InferenceContext<'a> { /// Typed tags gamma: HashMap, - /// Type annotations from user (for toplevel) - annotations: &'a HashMap, /// Fully normalized terms nforms: &'a HashMap>, } + impl<'a> InferenceContext<'a> { /// Create a new inference context pub fn new( annotations: &'a HashMap, nforms: &'a HashMap>, ) -> Self { - Self { - gamma: HashMap::new(), - annotations, - nforms, - } + let gamma = annotations + .iter() + .map(|(k, v)| (k.clone(), InferredType::Concrete(v.clone()))) + .collect(); + Self { gamma, nforms } } + fn infer_gather<'i, I: Iterator>>( &mut self, mut tags: I, @@ -113,14 +113,14 @@ impl<'a> InferenceContext<'a> { let mut types = vec![if let BoundTarget::Tag(tag) = tag { self.infer(tag)? } else { - InferredType::Concrete(parse_quote! {::pag_runtime::Span<'src>}) + InferredType::Concrete(parse_quote!(::pag_util::Span<'src>)) }]; for t in tags { // If any inference fails, the whole inference fails let ty = if let BoundTarget::Tag(t) = t { self.infer(t)? } else { - InferredType::Concrete(parse_quote! {::pag_runtime::Span<'src>}) + InferredType::Concrete(parse_quote!(::pag_util::Span<'src>)) }; types.push(ty); } @@ -133,9 +133,10 @@ impl<'a> InferenceContext<'a> { } } else { // no field, unit type - Some(InferredType::Concrete(parse_quote! {()})) + Some(InferredType::Concrete(parse_quote!(()))) } } + /// try infer all types, but may fail with incomplete type information. pub fn infer_all_types(mut self) -> HashMap { let mut typed = 0; @@ -149,79 +150,71 @@ impl<'a> InferenceContext<'a> { } self.gamma } + fn infer(&mut self, tag: &Tag) -> Option { if let Some(x) = self.gamma.get(tag) { return Some(x.clone()); } - let target = if let Some(x) = self.annotations.get(tag) { - // If a concrete type annotation is provided, use it directly - InferredType::Concrete(x.clone()) - } else { - // find first subexpression that fulfills inference - let nfs = self.nforms.get(tag)?; - let mut inferred = None; - for i in nfs.iter() { - let semact = i.semact(); - match semact { - SemAct::Recognize => { - inferred.replace(InferredType::Concrete(parse_quote!(()))); - break; - } - // Token semantic action, the type is Span - SemAct::Token => { - inferred.replace(InferredType::Concrete(parse_quote!( - ::pag_runtime::Span<'src> - ))); - break; - } - // Customized routine without type annotation, cannot infer - SemAct::CustomizedRoutine(..) => continue, - // Nested routine for one or more, the type is unit. - SemAct::OneOrMoreNested => { - inferred.replace(InferredType::Concrete(parse_quote!(()))); + + // Find first subexpression that fulfills inference + let nfs = self.nforms.get(tag)?; + let mut inferred = None; + for i in nfs.iter() { + let semact = i.semact(); + match semact { + SemAct::Recognize => { + inferred.replace(InferredType::Concrete(parse_quote!(()))); + break; + } + // Token semantic action, the type is Span + SemAct::Token => { + inferred.replace(InferredType::Concrete(parse_quote!(::pag_util::Span<'src>))); + break; + } + // Customized routine without type annotation, cannot infer + SemAct::CustomizedRoutine(..) => continue, + // Nested routine for one or more, the type is unit. + SemAct::OneOrMoreNested => { + inferred.replace(InferredType::Concrete(parse_quote!(()))); + break; + } + SemAct::Gather => { + let visible = i.visible_bindings(0); + if let Some(gather_type) = self.infer_gather(visible.into_iter().map(|x| x.1)) { + inferred.replace(gather_type); break; } - SemAct::Gather => { - let visible = i.visible_bindings(0); - if let Some(gather_type) = - self.infer_gather(visible.into_iter().map(|x| x.1)) - { - inferred.replace(gather_type); - break; - } - } - SemAct::ZeroOrMore | SemAct::Option | SemAct::OneOrMoreToplevel => { - let mapper = |ty: InferredType| { - if matches!(semact, SemAct::Option) { - InferredType::Option(Box::new(ty)) - } else { - InferredType::Collector(Box::new(ty)) - } - }; - // Skip epsilon production, this is safe since OneOrMoreToplevel will never be empty - if let NormalForm::Empty(x, _) = i { - if x.is_empty() { - continue; - } + } + SemAct::ZeroOrMore | SemAct::Option | SemAct::OneOrMoreToplevel => { + let mapper = |ty: InferredType| { + if matches!(semact, SemAct::Option) { + InferredType::Option(Box::new(ty)) + } else { + InferredType::Collector(Box::new(ty)) } - // skip the trailing part of OneOrMoreToplevel - let visible = - i.visible_bindings(if matches!(semact, SemAct::OneOrMoreToplevel) { - 1 - } else { - 0 - }); - if let Some(gather_type) = - self.infer_gather(visible.into_iter().map(|x| x.1)) - { - inferred.replace(mapper(gather_type)); - break; + }; + // Skip epsilon production, this is safe since OneOrMoreToplevel will never be empty + if let NormalForm::Empty(x, _) = i { + if x.is_empty() { + continue; } } + // skip the trailing part of OneOrMoreToplevel + let visible = + i.visible_bindings(if matches!(semact, SemAct::OneOrMoreToplevel) { + 1 + } else { + 0 + }); + if let Some(gather_type) = self.infer_gather(visible.into_iter().map(|x| x.1)) { + inferred.replace(mapper(gather_type)); + break; + } } } - inferred? - }; + } + + let target = inferred?; self.gamma.insert(tag.clone(), target.clone()); Some(target) } diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index f1e1ceb..46b16ae 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -124,6 +124,7 @@ impl NormalForm { | Self::Sequence(_, _, _, semact) => semact, } } + pub fn semact_mut(&mut self) -> &mut SemAct { match self { Self::Empty(_, semact) @@ -131,6 +132,7 @@ impl NormalForm { | Self::Sequence(_, _, _, semact) => semact, } } + pub fn append_tailcall(&mut self) { match self { Self::Empty(_actions, _) => { @@ -144,6 +146,7 @@ impl NormalForm { } } } + pub fn append_pass_collector(&mut self, tag: Tag) { match self { Self::Empty(_actions, _) => { @@ -157,6 +160,7 @@ impl NormalForm { } } } + pub fn visible_bindings(&self, skip: usize) -> Vec<(&Ident, BoundTarget)> { match self { Self::Empty(actions, _) => actions @@ -261,8 +265,8 @@ fn debug_print_test() { println!("{}", sequence); } -#[derive(Default, Clone)] /// Well, it is not the notorius firewall. +#[derive(Default, Clone)] pub struct NFTable(HashMap>); impl Deref for NFTable { diff --git a/pag-parser2/src/nf/translation.rs b/pag-parser2/src/nf/translation.rs index 90a8823..7a1120b 100644 --- a/pag-parser2/src/nf/translation.rs +++ b/pag-parser2/src/nf/translation.rs @@ -33,6 +33,7 @@ impl Translation { fn start_ignoring(&mut self) { self.ignoring_cnt += 1; } + /// Exit ignoring mode fn end_ignoring(&mut self) { self.ignoring_cnt -= 1; @@ -48,12 +49,14 @@ impl Translation { self.output_cnt += 1; result } + /// Allocate a new tag for anonymous routines. fn new_anonymous_tag(&mut self) -> Tag { let result = Tag::Anonymous(self.anonymous_cnt); self.anonymous_cnt += 1; result } + /// Construct a normal form from a sequence of parser expressions. The semact is always `Recognize`. fn partial_nf_from_sequence< 'a, @@ -101,9 +104,11 @@ impl Translation { } } } + fn add_nf(&mut self, tag: Tag, nf: NormalForm) { self.semi_nfs.entry(tag).or_default().push(nf); } + fn add_nf_from_anonymous_expr(&mut self, expr: &ParserExpr, tag: &Tag) { match expr { ParserExpr::Seq(..) => { @@ -194,9 +199,9 @@ impl Translation { } ParserExpr::ParserRef(_) => unreachable!("cannot create nf from parser ref"), ParserExpr::Ignore(_) => unreachable!("cannot create nf from ignore"), - ParserExpr::Hinted(_, _) => unreachable!("cannot create nf from hinted"), } } + fn add_anonymous_rule( &mut self, expr: &ParserExpr, @@ -221,11 +226,6 @@ impl Translation { self.end_ignoring(); (tag, output) } - ParserExpr::Hinted(expr, hint) => { - let (tag, output) = self.add_anonymous_rule::(expr, named); - self.hints.insert(tag.clone(), hint.clone()); - (tag, output) - } _ => { let tag = self.new_anonymous_tag(); self.add_nf_from_anonymous_expr(expr, &tag); From f03a5991a5fb0ac8509a6e02e50e453780ed8096 Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Sun, 30 Jul 2023 14:42:34 +0800 Subject: [PATCH 36/42] reformat --- pag-parser2/src/frontend/ast.rs | 19 +++++++++---------- pag-parser2/src/frontend/parse.rs | 4 +--- pag-parser2/src/nf/inference.rs | 2 +- pag-parser2/src/nf/semact.rs | 6 +++--- pag-parser2/src/nf/translation.rs | 4 ++-- 5 files changed, 16 insertions(+), 19 deletions(-) diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index 456f8e3..fb40153 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -17,30 +17,29 @@ pub struct Ast { } #[derive(Clone)] -#[repr(transparent)] -pub struct CustomizedBlock(pub Rc); +pub struct CodeBlock(pub Rc); -impl PartialEq for CustomizedBlock { +impl PartialEq for CodeBlock { fn eq(&self, other: &Self) -> bool { Rc::ptr_eq(&self.0, &other.0) } } -impl Eq for CustomizedBlock {} +impl Eq for CodeBlock {} -impl PartialOrd for CustomizedBlock { +impl PartialOrd for CodeBlock { fn partial_cmp(&self, other: &Self) -> Option { Rc::as_ptr(&self.0).partial_cmp(&Rc::as_ptr(&other.0)) } } -impl Ord for CustomizedBlock { +impl Ord for CodeBlock { fn cmp(&self, other: &Self) -> std::cmp::Ordering { Rc::as_ptr(&self.0).cmp(&Rc::as_ptr(&other.0)) } } -impl std::hash::Hash for CustomizedBlock { +impl std::hash::Hash for CodeBlock { fn hash(&self, state: &mut H) { Rc::as_ptr(&self.0).hash(state) } @@ -52,19 +51,19 @@ pub struct LexerDef { } pub struct ParserDef { - pub ty: syn::Type, + pub ty: syn::Type, // TODO: syn::Type is huge, maybe we should box it or only keep the span pub rules: Vec, } pub struct ParserRule { pub vars: Vec, - pub action: Option, + pub action: Option, } pub struct VarBinding { pub expr: ParserExpr, pub name: Option, - pub ty: Option + pub ty: Option, // TODO: syn::Type is huge, maybe we should box it or only keep the span } // TODO: how to express "bottom" & "any"? diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index 9f18550..1758763 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -128,9 +128,7 @@ impl Parse for ParserRule { let mut action = None; if input.peek(syn::token::Brace) { - action = Some(CustomizedBlock(std::rc::Rc::new( - input.parse::()?, - ))); + action = Some(CodeBlock(std::rc::Rc::new(input.parse::()?))); } Ok(Self { vars, action }) diff --git a/pag-parser2/src/nf/inference.rs b/pag-parser2/src/nf/inference.rs index ac75948..d2d9f86 100644 --- a/pag-parser2/src/nf/inference.rs +++ b/pag-parser2/src/nf/inference.rs @@ -172,7 +172,7 @@ impl<'a> InferenceContext<'a> { break; } // Customized routine without type annotation, cannot infer - SemAct::CustomizedRoutine(..) => continue, + SemAct::Customized(..) => continue, // Nested routine for one or more, the type is unit. SemAct::OneOrMoreNested => { inferred.replace(InferredType::Concrete(parse_quote!(()))); diff --git a/pag-parser2/src/nf/semact.rs b/pag-parser2/src/nf/semact.rs index 9dec982..9e44a4f 100644 --- a/pag-parser2/src/nf/semact.rs +++ b/pag-parser2/src/nf/semact.rs @@ -6,7 +6,7 @@ // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. -use crate::frontend::{CustomizedBlock, ParserExpr}; +use crate::frontend::{CodeBlock, ParserExpr}; /// /// ``` @@ -19,7 +19,7 @@ use crate::frontend::{CustomizedBlock, ParserExpr}; // those normal form without SemAct will be treated as plain scanner. #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum SemAct { - CustomizedRoutine(CustomizedBlock), + Customized(CodeBlock), /// Gather inner data. If multiple is selected, return a tuple. /// If only one is selected, return target data. Gather, @@ -57,7 +57,7 @@ impl SemAct { impl std::fmt::Display for SemAct { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - SemAct::CustomizedRoutine(x) => write!(f, "{:?}", std::rc::Rc::as_ptr(&x.0)), + SemAct::Customized(x) => write!(f, "{:?}", std::rc::Rc::as_ptr(&x.0)), SemAct::Gather => write!(f, "Gather"), SemAct::Option => write!(f, "Option"), SemAct::ZeroOrMore => write!(f, "ZeroOrMore"), diff --git a/pag-parser2/src/nf/translation.rs b/pag-parser2/src/nf/translation.rs index 7a1120b..e12adb2 100644 --- a/pag-parser2/src/nf/translation.rs +++ b/pag-parser2/src/nf/translation.rs @@ -251,13 +251,13 @@ impl Translation { .iter() .map(|rule| { let semact = if let Some(action) = &rule.action { - SemAct::CustomizedRoutine(action.clone()) + SemAct::Customized(action.clone()) } else if rule.vars.len() == 1 { SemAct::infer(&rule.vars[0].expr) } else { SemAct::Gather }; - let mut partial_nf = if matches!(semact, SemAct::CustomizedRoutine(..)) { + let mut partial_nf = if matches!(semact, SemAct::Customized(..)) { self.partial_nf_from_sequence::( rule.vars .iter() From b545ec31a62ea1ae150e77941a7c6ae12179894f Mon Sep 17 00:00:00 2001 From: QuarticCat Date: Mon, 31 Jul 2023 02:57:15 +0800 Subject: [PATCH 37/42] parse ParserExpr::Seq to vector --- pag-parser2/src/frontend/ast.rs | 45 +++++++------------------------ pag-parser2/src/frontend/parse.rs | 21 ++++++++------- pag-parser2/src/lib.rs | 2 +- 3 files changed, 23 insertions(+), 45 deletions(-) diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index fb40153..00ef1b5 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -9,13 +9,6 @@ use std::collections::HashMap; use std::rc::Rc; -pub struct Ast { - pub entry: syn::Ident, - pub skip: Option, - pub lexer_map: HashMap, - pub parser_map: HashMap, -} - #[derive(Clone)] pub struct CodeBlock(pub Rc); @@ -45,13 +38,20 @@ impl std::hash::Hash for CodeBlock { } } +pub struct Ast { + pub entry: syn::Ident, + pub skip: Option, + pub lexer_map: HashMap, + pub parser_map: HashMap, +} + pub struct LexerDef { pub idx: u32, pub expr: LexerExpr, } pub struct ParserDef { - pub ty: syn::Type, // TODO: syn::Type is huge, maybe we should box it or only keep the span + pub ty: Rc, pub rules: Vec, } @@ -63,7 +63,7 @@ pub struct ParserRule { pub struct VarBinding { pub expr: ParserExpr, pub name: Option, - pub ty: Option, // TODO: syn::Type is huge, maybe we should box it or only keep the span + pub ty: Option>, } // TODO: how to express "bottom" & "any"? @@ -82,7 +82,7 @@ pub enum LexerExpr { // TODO: how to express "select" & "ignore"? pub enum ParserExpr { - Seq(Box, Box), + Seq(Vec), Star(Box), Plus(Box), Opt(Box), @@ -90,28 +90,3 @@ pub enum ParserExpr { ParserRef(syn::Ident), Ignore(Box), } - -pub struct RightDeepIterator<'a> { - seq: Option<&'a ParserExpr>, -} - -impl<'a> From<&'a ParserExpr> for RightDeepIterator<'a> { - fn from(expr: &'a ParserExpr) -> Self { - Self { seq: Some(expr) } - } -} - -impl<'a> Iterator for RightDeepIterator<'a> { - type Item = &'a ParserExpr; - - fn next(&mut self) -> Option { - match self.seq { - Some(ParserExpr::Seq(a, b)) => { - self.seq = Some(b); - Some(a) - } - Some(_) => self.seq.take(), - None => None, - } - } -} diff --git a/pag-parser2/src/frontend/parse.rs b/pag-parser2/src/frontend/parse.rs index 1758763..c082c49 100644 --- a/pag-parser2/src/frontend/parse.rs +++ b/pag-parser2/src/frontend/parse.rs @@ -13,6 +13,7 @@ use syn::parse::{Parse, ParseStream}; use syn::{bracketed, parenthesized, parse_quote, Error, Result, Token}; use std::collections::HashMap; +use std::rc::Rc; #[derive(PartialEq, Eq)] enum IdentKind { @@ -98,10 +99,10 @@ impl Parse for Ast { impl Parse for ParserDef { // (":" syn::Type)? = (ParserRule)|+ fn parse(input: ParseStream) -> Result { - let ty = match input.parse::() { + let ty = Rc::new(match input.parse::() { Ok(_) => input.parse::()?, Err(_) => parse_quote!(::pag_util::Span<'src>), - }; + }); input.parse::()?; @@ -150,7 +151,7 @@ impl Parse for VarBinding { if content.peek(Token![:]) { content.parse::()?; - ty = Some(content.parse::()?); + ty = Some(Rc::new(content.parse::()?)); } if !content.is_empty() { @@ -305,13 +306,15 @@ fn parse_parser_expr(input: ParseStream, min_bp: u32, is_toplevel: bool) -> Resu if !is_toplevel && (input.peek(syn::Ident) || input.peek(syn::token::Paren) || input.peek(Token![#])) { - let (l_bp, r_bp) = (40, 41); - if l_bp < min_bp { - break; + let mut seq = vec![lhs]; + while input.peek(syn::Ident) || input.peek(syn::token::Paren) || input.peek(Token![#]) { + let (l_bp, r_bp) = (40, 41); + if l_bp < min_bp { + break; + } + seq.push(parse_parser_expr(input, r_bp, false)?); } - let rhs = parse_parser_expr(input, r_bp, false)?; - lhs = ParserExpr::Seq(Box::new(lhs), Box::new(rhs)); - continue; + return Ok(ParserExpr::Seq(seq)); } if input.peek(Token![*]) { let l_bp = 70; diff --git a/pag-parser2/src/lib.rs b/pag-parser2/src/lib.rs index 932e93c..a5b4fe7 100644 --- a/pag-parser2/src/lib.rs +++ b/pag-parser2/src/lib.rs @@ -10,4 +10,4 @@ #[cfg(feature = "debug")] mod debug; mod frontend; -mod nf; +// mod nf; From dc192386d872e8714858a545b0f7e73a38cb45a1 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Sun, 30 Jul 2023 17:09:30 -0400 Subject: [PATCH 38/42] embed type info into NF --- pag-parser2/src/debug.rs | 22 --- pag-parser2/src/lib.rs | 6 +- pag-parser2/src/nf/inference.rs | 221 ------------------------------ pag-parser2/src/nf/mod.rs | 202 ++++++++++++++------------- pag-parser2/src/nf/translation.rs | 1 - pag-parser2/src/utils.rs | 70 ++++++++++ 6 files changed, 182 insertions(+), 340 deletions(-) delete mode 100644 pag-parser2/src/debug.rs delete mode 100644 pag-parser2/src/nf/inference.rs create mode 100644 pag-parser2/src/utils.rs diff --git a/pag-parser2/src/debug.rs b/pag-parser2/src/debug.rs deleted file mode 100644 index fc99fdb..0000000 --- a/pag-parser2/src/debug.rs +++ /dev/null @@ -1,22 +0,0 @@ -#[cfg(feature = "ansi-debug")] -macro_rules! styled { - ($style:expr, $($arg:tt)*) => { - { - use nu_ansi_term::*; - $style.paint(format!($($arg)*)) - } - }; -} -#[cfg(not(feature = "ansi-debug"))] -macro_rules! styled { - ($style:expr, $($arg:tt)*) => {format!($($arg)*)}; -} - -macro_rules! styled_write { - ($dst:expr, $($arg:tt)*) => { - write!($dst, "{}", $crate::debug::styled!($($arg)*)) - }; -} - -pub(crate) use styled; -pub(crate) use styled_write; diff --git a/pag-parser2/src/lib.rs b/pag-parser2/src/lib.rs index a5b4fe7..781a7a8 100644 --- a/pag-parser2/src/lib.rs +++ b/pag-parser2/src/lib.rs @@ -6,8 +6,6 @@ // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. -#![feature(box_patterns)] -#[cfg(feature = "debug")] -mod debug; mod frontend; -// mod nf; +mod nf; +mod utils; diff --git a/pag-parser2/src/nf/inference.rs b/pag-parser2/src/nf/inference.rs deleted file mode 100644 index d2d9f86..0000000 --- a/pag-parser2/src/nf/inference.rs +++ /dev/null @@ -1,221 +0,0 @@ -// If there is no semantic action, the routine is plain scan over. Thus, the type is unit. -// ⊢ x = ..., SemAct[x] = ∅ -// ------------------- -// ⊢ x : () - -// A Customized Routine must have type annotation -// ⊢ x = ..., SemAct[x] = Customized(𝜏) -// ------------------- -// ⊢ x : 𝜏 - -// A Token action gives the span of a terminal -// ⊢ x = T, SemAct[x] = Token -// ------------------- -// ⊢ x : Span - -// Fully normalized Option must be in the following form: -// x = T_0 ... [r_0] | T_1 ... [r_1] | .. | ε -// -// Thus, the rule should be: -// -// Γ ⊢ x = T_0 ... [r_0] | T_1 ... [r_1] | .. | ε -// Γ ⊢ r_0 : 𝜏_0, r_1 : 𝜏_1 ... -// Γ ⊢ 𝜏 = 𝜏_0 = 𝜏_1 = ... -// SemAct[x] = Option -// ------------------- -// Γ ⊢ x : Option<𝜏> - -// Fully normalized ZeroOrMore must be in the following form: -// x = T_0 ... [r_0] | T_1 ... [r_1] | .. | ε -// -// Thus, the rule should be: -// -// Γ ⊢ x = T_0 ... [r_0] | T_1 ... [r_1] | .. | ε -// Γ ⊢ r_0 : 𝜏_0, r_1 : 𝜏_1 ... -// Γ ⊢ 𝜏 = 𝜏_0 = 𝜏_1 =... -// SemAct[x] = ZeroOrMore(Σ ∈ Collector<𝜏>) -// ------------------- -// Γ ⊢ x : Σ - -// Fully normalized OneOrMoreToplevel must be in the following form: -// x = T_0 ...[r_0] t | T_1 ... [r_1] t | .. -// -// Thus, the rule should be: -// -// Γ ⊢ x = T_0 ...[r_0] t | T_1 ... [r_1] t | .. -// Γ ⊢ r_0 : 𝜏_0, r_1 : 𝜏_1 ... -// Γ ⊢ 𝜏 = 𝜏_0 = 𝜏_1 = ... -// SemAct[x] = OneOrMoreToplevel -// ------------------- -// Γ ⊢ x : Σ - -// Fully normalized OneOrMoreNested must be in the following form: -// x = T_0 ... [r_0] | T_1 ... [r_1] | .. | ε -// -// Thus, the rule should be: -// -// Γ ⊢ x = T_0 ... [r_0] | T_1 ... [r_1] | .. | ε -// Γ ⊢ r_0 : 𝜏_0, r_1 : 𝜏_1 ... -// Γ ⊢ 𝜏 = 𝜏_0 = 𝜏_1 =... -// SemAct[x] = ZeroOrMore -// ------------------- -// Γ ⊢ x : () -- Notice that x accept &mut C ∈ Collector<𝜏> instead - -// Fully normalized Tuple must be in the following form: -// x = T_0 ... [r_0] x00 _x01 x02 | .. -// let η_i be the type tuple of everything including last reduce that gives an output. -// x = T_0 ... [r_0] x00 _x01 x02 | .. -// Γ ⊢ ║ η_0 ║ = ║ η_1 ║ = ... -// Γ ⊢ ∀i.∀j.∀k. η_i.k = η_j.k -// SemAct[x] = Gather -// ------------------- -// Γ ⊢ x : η - -use std::collections::HashMap; - -use syn::{parse_quote, Type}; - -use super::{semact::SemAct, BoundTarget, NormalForm, Tag}; - -#[derive(Clone)] -pub enum InferredType { - Concrete(Type), - Collector(Box), - Option(Box), - Tuple(Vec), -} - -pub struct InferenceContext<'a> { - /// Typed tags - gamma: HashMap, - /// Fully normalized terms - nforms: &'a HashMap>, -} - -impl<'a> InferenceContext<'a> { - /// Create a new inference context - pub fn new( - annotations: &'a HashMap, - nforms: &'a HashMap>, - ) -> Self { - let gamma = annotations - .iter() - .map(|(k, v)| (k.clone(), InferredType::Concrete(v.clone()))) - .collect(); - Self { gamma, nforms } - } - - fn infer_gather<'i, I: Iterator>>( - &mut self, - mut tags: I, - ) -> Option { - if let Some(tag) = tags.next() { - let mut types = vec![if let BoundTarget::Tag(tag) = tag { - self.infer(tag)? - } else { - InferredType::Concrete(parse_quote!(::pag_util::Span<'src>)) - }]; - for t in tags { - // If any inference fails, the whole inference fails - let ty = if let BoundTarget::Tag(t) = t { - self.infer(t)? - } else { - InferredType::Concrete(parse_quote!(::pag_util::Span<'src>)) - }; - types.push(ty); - } - if types.len() == 1 { - // If there is only one field, no need to wrap in a tuple - Some(types.pop().unwrap()) - } else { - // Otherwise, wrap in a tuple - Some(InferredType::Tuple(types)) - } - } else { - // no field, unit type - Some(InferredType::Concrete(parse_quote!(()))) - } - } - - /// try infer all types, but may fail with incomplete type information. - pub fn infer_all_types(mut self) -> HashMap { - let mut typed = 0; - while typed < self.nforms.len() { - typed = 0; - for i in self.nforms.keys() { - if self.infer(i).is_some() { - typed += 1; - } - } - } - self.gamma - } - - fn infer(&mut self, tag: &Tag) -> Option { - if let Some(x) = self.gamma.get(tag) { - return Some(x.clone()); - } - - // Find first subexpression that fulfills inference - let nfs = self.nforms.get(tag)?; - let mut inferred = None; - for i in nfs.iter() { - let semact = i.semact(); - match semact { - SemAct::Recognize => { - inferred.replace(InferredType::Concrete(parse_quote!(()))); - break; - } - // Token semantic action, the type is Span - SemAct::Token => { - inferred.replace(InferredType::Concrete(parse_quote!(::pag_util::Span<'src>))); - break; - } - // Customized routine without type annotation, cannot infer - SemAct::Customized(..) => continue, - // Nested routine for one or more, the type is unit. - SemAct::OneOrMoreNested => { - inferred.replace(InferredType::Concrete(parse_quote!(()))); - break; - } - SemAct::Gather => { - let visible = i.visible_bindings(0); - if let Some(gather_type) = self.infer_gather(visible.into_iter().map(|x| x.1)) { - inferred.replace(gather_type); - break; - } - } - SemAct::ZeroOrMore | SemAct::Option | SemAct::OneOrMoreToplevel => { - let mapper = |ty: InferredType| { - if matches!(semact, SemAct::Option) { - InferredType::Option(Box::new(ty)) - } else { - InferredType::Collector(Box::new(ty)) - } - }; - // Skip epsilon production, this is safe since OneOrMoreToplevel will never be empty - if let NormalForm::Empty(x, _) = i { - if x.is_empty() { - continue; - } - } - // skip the trailing part of OneOrMoreToplevel - let visible = - i.visible_bindings(if matches!(semact, SemAct::OneOrMoreToplevel) { - 1 - } else { - 0 - }); - if let Some(gather_type) = self.infer_gather(visible.into_iter().map(|x| x.1)) { - inferred.replace(mapper(gather_type)); - break; - } - } - } - } - - let target = inferred?; - self.gamma.insert(tag.clone(), target.clone()); - Some(target) - } -} diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index 46b16ae..6de59d6 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -6,21 +6,23 @@ // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. -mod inference; mod normalization; mod semact; -mod translation; +//mod translation; + +use crate::utils::Appendix; use std::{ collections::{HashMap, VecDeque}, ops::{Deref, DerefMut}, + rc::Rc, }; use quote::format_ident; use syn::Ident; #[cfg(feature = "debug")] -use crate::debug::{styled, styled_write}; +use crate::utils::{styled, styled_write}; use self::semact::SemAct; @@ -104,11 +106,34 @@ impl std::fmt::Display for Action { } } +#[derive(Clone)] +pub enum AbstractType { + /// Concrete type without any type parameter. + Concrete(Rc), + Option(Box), + Tuple(Vec), + Collector(Box), +} + #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum NormalForm { - Empty(Vec<(Tag, Option)>, SemAct), - Unexpanded(Vec, SemAct), - Sequence(Ident, Option, Vec, SemAct), + Empty { + actions: Vec, + semact: SemAct, + ty: Appendix, + }, + Unexpanded { + actions: Vec, + semact: SemAct, + ty: Appendix, + }, + Sequence { + token: Ident, + token_output: Option, + actions: Vec, + semact: SemAct, + ty: Appendix, + }, } pub enum BoundTarget<'a> { @@ -119,84 +144,67 @@ pub enum BoundTarget<'a> { impl NormalForm { pub fn semact(&self) -> &SemAct { match self { - Self::Empty(_, semact) - | Self::Unexpanded(_, semact) - | Self::Sequence(_, _, _, semact) => semact, + Self::Empty { semact, .. } => semact, + Self::Unexpanded { semact, .. } => semact, + Self::Sequence { semact, .. } => semact, } } pub fn semact_mut(&mut self) -> &mut SemAct { match self { - Self::Empty(_, semact) - | Self::Unexpanded(_, semact) - | Self::Sequence(_, _, _, semact) => semact, + Self::Empty { semact, .. } => semact, + Self::Unexpanded { semact, .. } => semact, + Self::Sequence { semact, .. } => semact, } } - pub fn append_tailcall(&mut self) { + pub fn actions(&self) -> &[Action] { match self { - Self::Empty(_actions, _) => { - unreachable!("empty cannot be tail called, otherwise there will be ambiguity") - } - Self::Unexpanded(actions, _) => { - actions.push(Action::TailCall); - } - Self::Sequence(_, _, actions, _) => { - actions.push(Action::TailCall); - } + Self::Empty { actions, .. } => actions, + Self::Unexpanded { actions, .. } => actions, + Self::Sequence { actions, .. } => actions, } } - pub fn append_pass_collector(&mut self, tag: Tag) { + pub fn actions_mut(&mut self) -> &mut Vec { match self { - Self::Empty(_actions, _) => { - unreachable!("empty cannot be followed by another subroutine, otherwise there will be ambiguity") - } - Self::Unexpanded(actions, _) => { - actions.push(Action::PassCollector(tag)); - } - Self::Sequence(_, _, actions, _) => { - actions.push(Action::PassCollector(tag)); - } + Self::Empty { actions, .. } => actions, + Self::Unexpanded { actions, .. } => actions, + Self::Sequence { actions, .. } => actions, } } - pub fn visible_bindings(&self, skip: usize) -> Vec<(&Ident, BoundTarget)> { - match self { - Self::Empty(actions, _) => actions - .last() - .and_then(|(tag, ident)| Some((ident.as_ref()?, BoundTarget::Tag(tag)))) - .into_iter() - .collect(), - Self::Unexpanded(actions, _) | Self::Sequence(_, _, actions, _) => { - let mut acc = VecDeque::new(); - for act in actions.iter().rev().skip(skip) { - match act { - Action::Shift { tag, output } => { - if let Some(ident) = output { - acc.push_front((ident, BoundTarget::Tag(tag))); - } - } - Action::Reduce { tag, output } => { - if let Some(ident) = output { - acc.push_front((ident, BoundTarget::Tag(tag))); - } - break; - } - Action::PassCollector(..) => continue, - Action::TailCall => continue, + pub fn visible_bindings(&self, skip: usize) -> Box<[(&Ident, BoundTarget)]> { + let mut acc = VecDeque::new(); + for act in self.actions().iter().rev().skip(skip) { + match act { + Action::Shift { tag, output } => { + if let Some(ident) = output { + acc.push_front((ident, BoundTarget::Tag(tag))); } } - if let Self::Sequence(_, Some(tk), _, _) = self { - if acc.len() == actions.len() - skip - && !matches!(actions.first(), Some(Action::Reduce { .. })) - { - acc.push_front((tk, BoundTarget::Token)); + Action::Reduce { tag, output } => { + if let Some(ident) = output { + acc.push_front((ident, BoundTarget::Tag(tag))); } + break; } - acc.into_iter().collect() + Action::PassCollector(..) => continue, + Action::TailCall => continue, + } + } + if let Self::Sequence { + token_output: Some(tk), + .. + } = self + { + if acc.len() == self.actions().len() - skip + && !matches!(self.actions().first(), Some(Action::Reduce { .. })) + { + acc.push_front((tk, BoundTarget::Token)); } } + acc.into_iter().collect() } } @@ -204,27 +212,28 @@ impl NormalForm { impl std::fmt::Display for NormalForm { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::Empty(actions, _) => { + Self::Empty { actions, .. } => { write!(f, "ε")?; - for (tag, output) in actions.iter() { - if let Some(name) = output { - styled_write!(f, Color::Blue, "\t{tag}[{name}]")?; - } else { - styled_write!(f, Color::Blue, "\t{tag}")?; - } + for action in actions { + write!(f, "\t{}", action)?; } } - Self::Unexpanded(actions, _) => { + Self::Unexpanded { actions, .. } => { write!(f, "{}", actions[0])?; for action in &actions[1..] { write!(f, "\t{}", action)?; } } - Self::Sequence(terminal, var, actions, _) => { - if let Some(tk) = var { - styled_write!(f, Color::Yellow, "{terminal}[{tk}]")?; + Self::Sequence { + token, + token_output, + actions, + .. + } => { + if let Some(tk) = token_output { + styled_write!(f, Color::Yellow, "{token}[{tk}]")?; } else { - styled_write!(f, Color::Yellow, "{terminal}")?; + styled_write!(f, Color::Yellow, "{token}")?; } for action in actions.iter() { write!(f, "\t{}", action)?; @@ -239,10 +248,10 @@ impl std::fmt::Display for NormalForm { #[test] fn debug_print_test() { use quote::format_ident; - let sequence = NormalForm::Sequence( - format_ident!("TEST"), - Some(format_ident!("x")), - vec![ + let sequence = NormalForm::Sequence { + token: format_ident!("TEST"), + token_output: Some(format_ident!("x")), + actions: vec![ Action::Shift { tag: Tag::Toplevel(format_ident!("a")), output: None, @@ -260,8 +269,9 @@ fn debug_print_test() { output: None, }, ], - SemAct::Gather, - ); + semact: SemAct::Gather, + ty: Appendix(AbstractType::Concrete(Rc::new(syn::parse_quote!(u32)))), + }; println!("{}", sequence); } @@ -316,10 +326,10 @@ impl std::fmt::Display for NFTable { #[test] fn debug_print_nf_table() { use quote::format_ident; - let sequence = NormalForm::Sequence( - format_ident!("TEST"), - Some(format_ident!("x")), - vec![ + let sequence = NormalForm::Sequence { + token: format_ident!("TEST"), + token_output: Some(format_ident!("x")), + actions: vec![ Action::Shift { tag: Tag::Toplevel(format_ident!("a")), output: None, @@ -337,15 +347,23 @@ fn debug_print_nf_table() { output: None, }, ], - SemAct::Gather, - ); - let empty = NormalForm::Empty( - vec![ - (Tag::Toplevel(format_ident!("a")), None), - (Tag::Toplevel(format_ident!("b")), Some(format_ident!("x"))), + semact: SemAct::Gather, + ty: Appendix(AbstractType::Concrete(Rc::new(syn::parse_quote!(u32)))), + }; + let empty = NormalForm::Empty { + actions: vec![ + Action::Reduce { + tag: Tag::Toplevel(format_ident!("b")), + output: Some(format_ident!("x")), + }, + Action::Reduce { + tag: Tag::Toplevel(format_ident!("c")), + output: Some(format_ident!("y")), + }, ], - SemAct::Gather, - ); + semact: SemAct::Gather, + ty: Appendix(AbstractType::Concrete(Rc::new(syn::parse_quote!(u32)))), + }; let table = NFTable( vec![ ( diff --git a/pag-parser2/src/nf/translation.rs b/pag-parser2/src/nf/translation.rs index e12adb2..a871361 100644 --- a/pag-parser2/src/nf/translation.rs +++ b/pag-parser2/src/nf/translation.rs @@ -9,7 +9,6 @@ use syn::{Ident, Type}; use super::NFTable; use super::{semact::SemAct, Action, NormalForm, Tag}; -use crate::frontend::RightDeepIterator; use crate::frontend::{ParserDef, ParserExpr}; #[derive(Default)] diff --git a/pag-parser2/src/utils.rs b/pag-parser2/src/utils.rs new file mode 100644 index 0000000..01a9511 --- /dev/null +++ b/pag-parser2/src/utils.rs @@ -0,0 +1,70 @@ +use std::ops::{Deref, DerefMut}; + +#[cfg(feature = "ansi-debug")] +macro_rules! styled { + ($style:expr, $($arg:tt)*) => { + { + use nu_ansi_term::*; + $style.paint(format!($($arg)*)) + } + }; +} +#[cfg(not(feature = "ansi-debug"))] +macro_rules! styled { + ($style:expr, $($arg:tt)*) => {format!($($arg)*)}; +} + +#[cfg(feature = "debug")] +macro_rules! styled_write { + ($dst:expr, $($arg:tt)*) => { + write!($dst, "{}", $crate::utils::styled!($($arg)*)) + }; +} + +#[cfg(feature = "debug")] +pub(crate) use styled; + +#[cfg(feature = "debug")] +pub(crate) use styled_write; + +/// Appendix that does not count in equality/ordinality/hashing. +#[derive(Clone)] +pub struct Appendix(pub T); + +impl Deref for Appendix { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Appendix { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl PartialEq for Appendix { + fn eq(&self, _other: &Self) -> bool { + true + } +} + +impl Eq for Appendix {} + +impl PartialOrd for Appendix { + fn partial_cmp(&self, _other: &Self) -> Option { + Some(std::cmp::Ordering::Equal) + } +} + +impl Ord for Appendix { + fn cmp(&self, _other: &Self) -> std::cmp::Ordering { + std::cmp::Ordering::Equal + } +} + +impl std::hash::Hash for Appendix { + fn hash(&self, _state: &mut H) {} +} From 4865e766c4f5d3bda0ac4b8f7aed6ac21869a375 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Sun, 30 Jul 2023 18:48:27 -0400 Subject: [PATCH 39/42] translation with type info --- pag-parser2/src/frontend/ast.rs | 31 +++ pag-parser2/src/lib.rs | 2 +- pag-parser2/src/nf/mod.rs | 44 ++++- pag-parser2/src/nf/translation.rs | 308 +++++++++++++++++++++++------- pag-parser2/src/utils.rs | 6 + 5 files changed, 313 insertions(+), 78 deletions(-) diff --git a/pag-parser2/src/frontend/ast.rs b/pag-parser2/src/frontend/ast.rs index 00ef1b5..98bc33d 100644 --- a/pag-parser2/src/frontend/ast.rs +++ b/pag-parser2/src/frontend/ast.rs @@ -90,3 +90,34 @@ pub enum ParserExpr { ParserRef(syn::Ident), Ignore(Box), } + +pub enum SequenceIterator<'a> { + End, + Singleton(&'a ParserExpr), + Multiple(std::slice::Iter<'a, ParserExpr>), +} + +impl<'a> From<&'a ParserExpr> for SequenceIterator<'a> { + fn from(value: &'a ParserExpr) -> Self { + match value { + ParserExpr::Seq(inner) => Self::Multiple(inner.iter()), + _ => Self::Singleton(value), + } + } +} + +impl<'a> Iterator for SequenceIterator<'a> { + type Item = &'a ParserExpr; + + fn next(&mut self) -> Option { + match self { + SequenceIterator::End => None, + SequenceIterator::Singleton(result) => { + let result = *result; + *self = Self::End; + Some(result) + } + SequenceIterator::Multiple(ref mut iter) => iter.next(), + } + } +} diff --git a/pag-parser2/src/lib.rs b/pag-parser2/src/lib.rs index 781a7a8..0394c10 100644 --- a/pag-parser2/src/lib.rs +++ b/pag-parser2/src/lib.rs @@ -5,7 +5,7 @@ // license , at your // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. - +#![feature(box_patterns)] mod frontend; mod nf; mod utils; diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index 6de59d6..d6d49f7 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -8,7 +8,7 @@ mod normalization; mod semact; -//mod translation; +mod translation; use crate::utils::Appendix; @@ -115,6 +115,20 @@ pub enum AbstractType { Collector(Box), } +thread_local! { + static UNIT_TYPE: AbstractType = AbstractType::Concrete(Rc::new(syn::parse_quote!(()))); + static SPAN_TYPE: AbstractType = AbstractType::Concrete(Rc::new(syn::parse_quote!(::pag_util::Span<'src>))); +} + +impl AbstractType { + pub fn unit_type() -> Self { + UNIT_TYPE.with(Self::clone) + } + pub fn span_type() -> Self { + SPAN_TYPE.with(Self::clone) + } +} + #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum NormalForm { Empty { @@ -174,6 +188,22 @@ impl NormalForm { } } + pub fn ty(&self) -> &Appendix { + match self { + Self::Empty { ty, .. } => ty, + Self::Unexpanded { ty, .. } => ty, + Self::Sequence { ty, .. } => ty, + } + } + + pub fn ty_mut(&mut self) -> &mut Appendix { + match self { + Self::Empty { ty, .. } => ty, + Self::Unexpanded { ty, .. } => ty, + Self::Sequence { ty, .. } => ty, + } + } + pub fn visible_bindings(&self, skip: usize) -> Box<[(&Ident, BoundTarget)]> { let mut acc = VecDeque::new(); for act in self.actions().iter().rev().skip(skip) { @@ -212,22 +242,29 @@ impl NormalForm { impl std::fmt::Display for NormalForm { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::Empty { actions, .. } => { + Self::Empty { + actions, semact, .. + } => { write!(f, "ε")?; for action in actions { write!(f, "\t{}", action)?; } + write!(f, "\t{{{}}}", semact)?; } - Self::Unexpanded { actions, .. } => { + Self::Unexpanded { + actions, semact, .. + } => { write!(f, "{}", actions[0])?; for action in &actions[1..] { write!(f, "\t{}", action)?; } + write!(f, "\t{{{}}}", semact)?; } Self::Sequence { token, token_output, actions, + semact, .. } => { if let Some(tk) = token_output { @@ -238,6 +275,7 @@ impl std::fmt::Display for NormalForm { for action in actions.iter() { write!(f, "\t{}", action)?; } + write!(f, "\t{{{}}}", semact)?; } } Ok(()) diff --git a/pag-parser2/src/nf/translation.rs b/pag-parser2/src/nf/translation.rs index a871361..469e90a 100644 --- a/pag-parser2/src/nf/translation.rs +++ b/pag-parser2/src/nf/translation.rs @@ -3,22 +3,22 @@ //! use std::collections::HashMap; +use std::rc::Rc; use quote::format_ident; use syn::{Ident, Type}; -use super::NFTable; use super::{semact::SemAct, Action, NormalForm, Tag}; -use crate::frontend::{ParserDef, ParserExpr}; +use super::{AbstractType, NFTable}; +use crate::frontend::{Ast, ParserDef, ParserExpr, SequenceIterator}; -#[derive(Default)] struct Translation { /// Table of semi-normalized production rules semi_nfs: NFTable, /// Toplevel type annotations - annotations: HashMap, + annotations: HashMap>, /// Type hints when calling inner routines (collector) - hints: HashMap, + hints: HashMap>, /// Counter of assigned non-explicit variable names output_cnt: usize, /// Counter of assigned anonymous routines @@ -27,6 +27,30 @@ struct Translation { ignoring_cnt: usize, } +type NFAttrTuple = (Tag, Option, AbstractType); + +impl From<&'_ Ast> for Translation { + fn from(value: &Ast) -> Self { + let annotations = value + .parser_map + .iter() + .map(|(ident, def)| (Tag::Toplevel(ident.clone()), def.ty.clone())) + .collect(); + let mut translation = Self { + semi_nfs: Default::default(), + annotations, + hints: Default::default(), + output_cnt: 0, + anonymous_cnt: 0, + ignoring_cnt: 0, + }; + for (i, def) in &value.parser_map { + translation.add_toplevel_def(i.clone(), def); + } + translation + } +} + impl Translation { /// Enter ignoring mode fn start_ignoring(&mut self) { @@ -56,50 +80,130 @@ impl Translation { result } - /// Construct a normal form from a sequence of parser expressions. The semact is always `Recognize`. - fn partial_nf_from_sequence< + fn infer_type>( + &self, + mut inner_types: I, + semact: &SemAct, + tag: &Tag, + ) -> AbstractType { + match semact { + SemAct::Customized(_) => AbstractType::Concrete(self.annotations[tag].clone()), + SemAct::Gather => match inner_types.len() { + 0 => AbstractType::unit_type(), + 1 => inner_types.next().unwrap(), + _ => AbstractType::Tuple(inner_types.collect()), + }, + SemAct::Option => match inner_types.len() { + 0 => AbstractType::Option(Box::new(AbstractType::unit_type())), + 1 => AbstractType::Option(Box::new(inner_types.next().unwrap())), + _ => AbstractType::Option(Box::new(AbstractType::Tuple(inner_types.collect()))), + }, + SemAct::ZeroOrMore => match inner_types.len() { + 0 => AbstractType::Collector(Box::new(AbstractType::unit_type())), + 1 => AbstractType::Collector(Box::new(inner_types.next().unwrap())), + _ => AbstractType::Collector(Box::new(AbstractType::Tuple(inner_types.collect()))), + }, + SemAct::OneOrMoreToplevel => match inner_types.len() { + 0 => AbstractType::Collector(Box::new(AbstractType::unit_type())), + 1 => AbstractType::Collector(Box::new(inner_types.next().unwrap())), + _ => AbstractType::Collector(Box::new(AbstractType::Tuple(inner_types.collect()))), + }, + SemAct::OneOrMoreNested => AbstractType::unit_type(), + SemAct::Token => AbstractType::span_type(), + SemAct::Recognize => AbstractType::unit_type(), + } + } + + /// Construct a normal form from a sequence of parser expressions. + fn create_nf_from_sequence< 'a, const IGNORE_UNNAMED: bool, I: Iterator)>, >( &mut self, mut iter: I, + semact: SemAct, + tag: &Tag, ) -> NormalForm { + debug_assert_eq!( + self.ignoring(), + matches!(semact, SemAct::Recognize), + "semact must be Recognize in ignoring mode" + ); match iter.next() { - None => NormalForm::Empty(vec![], SemAct::Recognize), + None => NormalForm::Empty { + actions: vec![], + semact, + ty: AbstractType::unit_type().into(), + }, Some((ParserExpr::Ignore(box ParserExpr::LexerRef(token)), _)) => { - let tail = iter + let mut types = Vec::new(); + let actions = iter .map(|(inner, named)| self.add_anonymous_rule::(inner, named)) - .map(|(tag, output)| Action::Shift { tag, output }) + .map(|(tag, output, ty)| { + if output.is_some() { + types.push(ty); + } + Action::Shift { tag, output } + }) .collect(); - NormalForm::Sequence(token.clone(), None, tail, SemAct::Recognize) + let ty = self.infer_type(types.into_iter(), &semact, tag).into(); + NormalForm::Sequence { + token: token.clone(), + token_output: None, + actions, + semact, + ty, + } } Some((ParserExpr::LexerRef(token), named)) => { - let tail = iter + let mut types = Vec::new(); + if named.is_some() { + types.push(AbstractType::span_type()) + } + let actions = iter .map(|(inner, named)| self.add_anonymous_rule::(inner, named)) - .map(|(tag, output)| Action::Shift { tag, output }) + .map(|(tag, output, ty)| { + if output.is_some() { + types.push(ty); + } + Action::Shift { tag, output } + }) .collect(); - NormalForm::Sequence( - token.clone(), - if self.ignoring() { + let ty = self.infer_type(types.into_iter(), &semact, tag).into(); + NormalForm::Sequence { + token: token.clone(), + token_output: if matches!(semact, SemAct::Recognize) { None } else if IGNORE_UNNAMED { named } else { named.or_else(|| Some(self.new_output_sym())) }, - tail, - SemAct::Recognize, - ) + actions, + semact, + ty, + } } Some((expr, named)) => { - let sequence = [(expr, named)] + let mut types = Vec::new(); + let actions = [(expr, named)] .into_iter() .chain(iter) .map(|(inner, named)| self.add_anonymous_rule::(inner, named)) - .map(|(tag, output)| Action::Shift { tag, output }) + .map(|(tag, output, ty)| { + if output.is_some() { + types.push(ty); + } + Action::Shift { tag, output } + }) .collect(); - NormalForm::Unexpanded(sequence, SemAct::Recognize) + let ty = self.infer_type(types.into_iter(), &semact, tag).into(); + NormalForm::Unexpanded { + actions, + semact, + ty, + } } } } @@ -108,50 +212,83 @@ impl Translation { self.semi_nfs.entry(tag).or_default().push(nf); } - fn add_nf_from_anonymous_expr(&mut self, expr: &ParserExpr, tag: &Tag) { + fn add_nf_from_anonymous_expr(&mut self, expr: &ParserExpr, tag: &Tag) -> AbstractType { match expr { - ParserExpr::Seq(..) => { - let mut partial_nf = self.partial_nf_from_sequence::( - RightDeepIterator::from(expr).map(|expr| (expr, None)), - ); - *partial_nf.semact_mut() = if self.ignoring() { + ParserExpr::Seq(exprs) => { + let semact = if self.ignoring() { SemAct::Recognize } else { SemAct::Gather }; + let partial_nf = self.create_nf_from_sequence::( + exprs.iter().map(|expr| (expr, None)), + semact, + tag, + ); + let ty = partial_nf.ty().0.clone(); self.add_nf(tag.clone(), partial_nf); + ty } ParserExpr::Opt(inner) => { - let mut partial_nf = self.partial_nf_from_sequence::( - RightDeepIterator::from(inner.as_ref()).map(|expr| (expr, None)), - ); let semact = if self.ignoring() { SemAct::Recognize } else { SemAct::Option }; + let mut partial_nf = self.create_nf_from_sequence::( + SequenceIterator::from(inner.as_ref()).map(|expr| (expr, None)), + semact.clone(), + tag, + ); + let ty = partial_nf.ty().clone(); *partial_nf.semact_mut() = semact.clone(); self.add_nf(tag.clone(), partial_nf); // add one more rule for empty - self.add_nf(tag.clone(), NormalForm::Empty(vec![], semact)); + self.add_nf( + tag.clone(), + NormalForm::Empty { + actions: vec![], + semact, + ty: ty.clone(), + }, + ); + ty.0 } ParserExpr::Star(inner) => { - let mut partial_nf = self.partial_nf_from_sequence::( - RightDeepIterator::from(inner.as_ref()).map(|expr| (expr, None)), - ); let semact = if self.ignoring() { - SemAct::Recognize - } else { SemAct::ZeroOrMore + } else { + SemAct::Option }; + let mut partial_nf = self.create_nf_from_sequence::( + SequenceIterator::from(inner.as_ref()).map(|expr| (expr, None)), + semact.clone(), + tag, + ); + let ty = partial_nf.ty().clone(); *partial_nf.semact_mut() = semact.clone(); self.add_nf(tag.clone(), partial_nf); // add one more rule for empty - self.add_nf(tag.clone(), NormalForm::Empty(vec![], semact)); + self.add_nf( + tag.clone(), + NormalForm::Empty { + actions: vec![], + semact, + ty: ty.clone(), + }, + ); + ty.0 } ParserExpr::Plus(inner) => { - let mut partial_nf = self.partial_nf_from_sequence::( - RightDeepIterator::from(inner.as_ref()).map(|expr| (expr, None)), + let semact = if self.ignoring() { + SemAct::Recognize + } else { + SemAct::OneOrMoreToplevel + }; + let mut partial_nf = self.create_nf_from_sequence::( + SequenceIterator::from(inner.as_ref()).map(|expr| (expr, None)), + semact.clone(), + tag, ); let nested_tag = self.new_anonymous_tag(); // the nested routine @@ -164,37 +301,51 @@ impl Translation { self.add_nf(nested_tag.clone(), { let mut nf = partial_nf.clone(); - nf.append_tailcall(); + nf.actions_mut().push(Action::TailCall); *nf.semact_mut() = semact.clone(); nf }); - self.add_nf(nested_tag.clone(), NormalForm::Empty(vec![], semact)); + self.add_nf( + nested_tag.clone(), + NormalForm::Empty { + actions: vec![], + semact, + ty: AbstractType::unit_type().into(), + }, + ); } // the toplevel routine { - let semact = if self.ignoring() { - SemAct::Recognize - } else { - SemAct::OneOrMoreToplevel - }; - partial_nf.append_pass_collector(nested_tag); - *partial_nf.semact_mut() = semact; + partial_nf + .actions_mut() + .push(Action::PassCollector(tag.clone())); + let ty = partial_nf.ty().0.clone(); self.add_nf(tag.clone(), partial_nf); + ty } } ParserExpr::LexerRef(ident) => { let nf = if self.ignoring() { - NormalForm::Sequence(ident.clone(), None, vec![], SemAct::Recognize) + NormalForm::Sequence { + token: ident.clone(), + token_output: None, + actions: vec![], + semact: SemAct::Recognize, + ty: AbstractType::unit_type().into(), + } } else { - NormalForm::Sequence( - ident.clone(), - Some(self.new_output_sym()), - vec![], - SemAct::Token, - ) + NormalForm::Sequence { + token: ident.clone(), + token_output: Some(self.new_output_sym()), + actions: vec![], + semact: SemAct::Token, + ty: AbstractType::span_type().into(), + } }; + let ty = nf.ty().0.clone(); self.add_nf(tag.clone(), nf); + ty } ParserExpr::ParserRef(_) => unreachable!("cannot create nf from parser ref"), ParserExpr::Ignore(_) => unreachable!("cannot create nf from ignore"), @@ -205,7 +356,7 @@ impl Translation { &mut self, expr: &ParserExpr, named: Option, - ) -> (Tag, Option) { + ) -> NFAttrTuple { let is_unnamed = named.is_none(); if IGNORE_UNNAMED && is_unnamed { self.start_ignoring(); @@ -214,24 +365,30 @@ impl Translation { ParserExpr::ParserRef(x) => { let tag = Tag::Toplevel(x.clone()); if self.ignoring() { - (tag, None) + (tag, None, AbstractType::unit_type()) } else { - (tag, named.or_else(|| Some(self.new_output_sym()))) + let ty = self + .annotations + .get(&tag) + .map(Rc::clone) + .map(AbstractType::Concrete) + .expect("toplevel rule must be typed"); + (tag, named.or_else(|| Some(self.new_output_sym())), ty) } } ParserExpr::Ignore(expr) => { self.start_ignoring(); - let (tag, output) = self.add_anonymous_rule::(expr, named); + let result = self.add_anonymous_rule::(expr, named); self.end_ignoring(); - (tag, output) + result } _ => { let tag = self.new_anonymous_tag(); - self.add_nf_from_anonymous_expr(expr, &tag); + let ty = self.add_nf_from_anonymous_expr(expr, &tag); if self.ignoring() { - (tag, None) + (tag, None, AbstractType::unit_type()) } else { - (tag, named.or_else(|| Some(self.new_output_sym()))) + (tag, named.or_else(|| Some(self.new_output_sym())), ty) } } }; @@ -244,7 +401,6 @@ impl Translation { // Translate a top-level definition fn add_toplevel_def(&mut self, name: Ident, def: &ParserDef) { let tag = Tag::Toplevel(name); - self.annotations.insert(tag.clone(), def.ty.clone()); let rules = def .rules .iter() @@ -256,20 +412,23 @@ impl Translation { } else { SemAct::Gather }; - let mut partial_nf = if matches!(semact, SemAct::Customized(..)) { - self.partial_nf_from_sequence::( + let partial_nf = if matches!(semact, SemAct::Customized(..)) { + self.create_nf_from_sequence::( rule.vars .iter() .map(|binding| (&binding.expr, binding.name.clone())), + semact, + &tag, ) } else { - self.partial_nf_from_sequence::( + self.create_nf_from_sequence::( rule.vars .iter() .map(|binding| (&binding.expr, binding.name.clone())), + semact, + &tag, ) }; - *partial_nf.semact_mut() = semact; partial_nf }) .collect(); @@ -303,11 +462,12 @@ mod test { "#, ) .unwrap(); - let mut translation = Translation::default(); - for (name, def) in ast.parser_map.iter() { - translation.add_toplevel_def(name.clone(), def); - } #[cfg(feature = "debug")] - println!("{}", translation.semi_nfs); + { + let translation = Translation::from(&ast); + println!("{}", translation.semi_nfs); + } + #[cfg(not(feature = "debug"))] + let _ = Translation::from(&ast); } } diff --git a/pag-parser2/src/utils.rs b/pag-parser2/src/utils.rs index 01a9511..32ee6dc 100644 --- a/pag-parser2/src/utils.rs +++ b/pag-parser2/src/utils.rs @@ -31,6 +31,12 @@ pub(crate) use styled_write; #[derive(Clone)] pub struct Appendix(pub T); +impl From for Appendix { + fn from(x: T) -> Self { + Self(x) + } +} + impl Deref for Appendix { type Target = T; From e6422995703ab45a2aa524a7736b5cb492a88319 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Sun, 30 Jul 2023 22:47:13 -0400 Subject: [PATCH 40/42] process type hints --- pag-lexer/src/lookahead.rs | 6 ++-- pag-parser2/src/nf/translation.rs | 58 +++++++++++++++++++------------ 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/pag-lexer/src/lookahead.rs b/pag-lexer/src/lookahead.rs index 1db0c39..b66862c 100644 --- a/pag-lexer/src/lookahead.rs +++ b/pag-lexer/src/lookahead.rs @@ -199,7 +199,9 @@ mod test { fn test_lookahead_codegen() { use crate::intervals; let positives = intervals!((b'0', b'9'), (b'0', b'9'), (b'A', b'F')); - syn::parse2::(generate_lookahead_routine(&positives, Kind::Positive)).unwrap(); - syn::parse2::(generate_lookahead_routine(&positives, Kind::Negative)).unwrap(); + let positive = generate_lookahead_routine(&positives, Kind::Positive); + let _: syn::Expr = syn::parse_quote! { { #positive } }; + let negative = generate_lookahead_routine(&positives, Kind::Negative); + let _: syn::Expr = syn::parse_quote! { { #negative } }; } } diff --git a/pag-parser2/src/nf/translation.rs b/pag-parser2/src/nf/translation.rs index 469e90a..587ce03 100644 --- a/pag-parser2/src/nf/translation.rs +++ b/pag-parser2/src/nf/translation.rs @@ -118,7 +118,7 @@ impl Translation { fn create_nf_from_sequence< 'a, const IGNORE_UNNAMED: bool, - I: Iterator)>, + I: Iterator, Option>)>, >( &mut self, mut iter: I, @@ -136,11 +136,15 @@ impl Translation { semact, ty: AbstractType::unit_type().into(), }, - Some((ParserExpr::Ignore(box ParserExpr::LexerRef(token)), _)) => { + Some((ParserExpr::Ignore(box ParserExpr::LexerRef(token)), _, _)) => { let mut types = Vec::new(); let actions = iter - .map(|(inner, named)| self.add_anonymous_rule::(inner, named)) - .map(|(tag, output, ty)| { + .map(|(inner, named, hint)| { + let (tag, output, ty) = + self.add_anonymous_rule::(inner, named); + if let Some(x) = hint { + self.hints.insert(tag.clone(), x); + } if output.is_some() { types.push(ty); } @@ -156,14 +160,18 @@ impl Translation { ty, } } - Some((ParserExpr::LexerRef(token), named)) => { + Some((ParserExpr::LexerRef(token), named, _)) => { let mut types = Vec::new(); if named.is_some() { types.push(AbstractType::span_type()) } let actions = iter - .map(|(inner, named)| self.add_anonymous_rule::(inner, named)) - .map(|(tag, output, ty)| { + .map(|(inner, named, hint)| { + let (tag, output, ty) = + self.add_anonymous_rule::(inner, named); + if let Some(x) = hint { + self.hints.insert(tag.clone(), x); + } if output.is_some() { types.push(ty); } @@ -185,13 +193,17 @@ impl Translation { ty, } } - Some((expr, named)) => { + Some((expr, named, hint)) => { let mut types = Vec::new(); - let actions = [(expr, named)] + let actions = [(expr, named, hint)] .into_iter() .chain(iter) - .map(|(inner, named)| self.add_anonymous_rule::(inner, named)) - .map(|(tag, output, ty)| { + .map(|(inner, named, hint)| { + let (tag, output, ty) = + self.add_anonymous_rule::(inner, named); + if let Some(x) = hint { + self.hints.insert(tag.clone(), x); + } if output.is_some() { types.push(ty); } @@ -221,7 +233,7 @@ impl Translation { SemAct::Gather }; let partial_nf = self.create_nf_from_sequence::( - exprs.iter().map(|expr| (expr, None)), + exprs.iter().map(|expr| (expr, None, None)), semact, tag, ); @@ -236,7 +248,7 @@ impl Translation { SemAct::Option }; let mut partial_nf = self.create_nf_from_sequence::( - SequenceIterator::from(inner.as_ref()).map(|expr| (expr, None)), + SequenceIterator::from(inner.as_ref()).map(|expr| (expr, None, None)), semact.clone(), tag, ); @@ -261,7 +273,7 @@ impl Translation { SemAct::Option }; let mut partial_nf = self.create_nf_from_sequence::( - SequenceIterator::from(inner.as_ref()).map(|expr| (expr, None)), + SequenceIterator::from(inner.as_ref()).map(|expr| (expr, None, None)), semact.clone(), tag, ); @@ -286,8 +298,8 @@ impl Translation { SemAct::OneOrMoreToplevel }; let mut partial_nf = self.create_nf_from_sequence::( - SequenceIterator::from(inner.as_ref()).map(|expr| (expr, None)), - semact.clone(), + SequenceIterator::from(inner.as_ref()).map(|expr| (expr, None, None)), + semact, tag, ); let nested_tag = self.new_anonymous_tag(); @@ -307,7 +319,7 @@ impl Translation { }); self.add_nf( - nested_tag.clone(), + nested_tag, NormalForm::Empty { actions: vec![], semact, @@ -414,17 +426,17 @@ impl Translation { }; let partial_nf = if matches!(semact, SemAct::Customized(..)) { self.create_nf_from_sequence::( - rule.vars - .iter() - .map(|binding| (&binding.expr, binding.name.clone())), + rule.vars.iter().map(|binding| { + (&binding.expr, binding.name.clone(), binding.ty.clone()) + }), semact, &tag, ) } else { self.create_nf_from_sequence::( - rule.vars - .iter() - .map(|binding| (&binding.expr, binding.name.clone())), + rule.vars.iter().map(|binding| { + (&binding.expr, binding.name.clone(), binding.ty.clone()) + }), semact, &tag, ) From aa1db2c7f4a518f526dc4548870a3031766e32d5 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Mon, 31 Jul 2023 10:17:27 -0400 Subject: [PATCH 41/42] fix wrong semact --- pag-parser2/src/nf/translation.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pag-parser2/src/nf/translation.rs b/pag-parser2/src/nf/translation.rs index 587ce03..4f81cc0 100644 --- a/pag-parser2/src/nf/translation.rs +++ b/pag-parser2/src/nf/translation.rs @@ -268,9 +268,9 @@ impl Translation { } ParserExpr::Star(inner) => { let semact = if self.ignoring() { - SemAct::ZeroOrMore + SemAct::Recognize } else { - SemAct::Option + SemAct::ZeroOrMore }; let mut partial_nf = self.create_nf_from_sequence::( SequenceIterator::from(inner.as_ref()).map(|expr| (expr, None, None)), From 0900a4439b1750dd1e036a51a1255c91dc9f3b48 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Mon, 31 Jul 2023 12:07:45 -0400 Subject: [PATCH 42/42] implement normaliation --- pag-parser2/src/nf/mod.rs | 84 +++++------------ pag-parser2/src/nf/normalization.rs | 135 ++++++++++++++++++++++++++++ pag-parser2/src/nf/translation.rs | 28 +++--- pag-parser2/src/utils.rs | 1 + 4 files changed, 171 insertions(+), 77 deletions(-) diff --git a/pag-parser2/src/nf/mod.rs b/pag-parser2/src/nf/mod.rs index d6d49f7..fb29a25 100644 --- a/pag-parser2/src/nf/mod.rs +++ b/pag-parser2/src/nf/mod.rs @@ -5,21 +5,20 @@ // license , at your // option. All files in the project carrying such notice may not be copied, // modified, or distributed except according to those terms. - -mod normalization; mod semact; mod translation; +mod normalization; use crate::utils::Appendix; use std::{ - collections::{HashMap, VecDeque}, + collections::HashMap, ops::{Deref, DerefMut}, rc::Rc, }; use quote::format_ident; -use syn::Ident; +use syn::{Ident, Type}; #[cfg(feature = "debug")] use crate::utils::{styled, styled_write}; @@ -64,7 +63,7 @@ impl std::fmt::Display for Tag { /// reducing from left to right, we maintain the context of which the current /// semantic action to reduce, and always assign "__0", "__1", "__2". When a [`Reduce`] is /// encountered, we start over from "__0". -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Action { Shift { /// Parser routine to call. @@ -73,7 +72,8 @@ pub enum Action { }, Reduce { /// Reduction routine to call. - tag: Tag, + semact: SemAct, + hint: Option>>, output: Option, }, /// Specialized action for tail call optimization. @@ -86,11 +86,11 @@ pub enum Action { impl std::fmt::Display for Action { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::Reduce { tag, output } => { + Self::Reduce { semact, output, .. } => { if let Some(name) = output { - styled_write!(f, Color::Blue, "{tag}[{name}]") + styled_write!(f, Color::Blue, "{semact}[{name}]") } else { - styled_write!(f, Color::Blue, "{tag}") + styled_write!(f, Color::Blue, "{semact}") } } Self::Shift { tag, output } => { @@ -143,18 +143,12 @@ pub enum NormalForm { }, Sequence { token: Ident, - token_output: Option, actions: Vec, semact: SemAct, ty: Appendix, }, } -pub enum BoundTarget<'a> { - Tag(&'a Tag), - Token, -} - impl NormalForm { pub fn semact(&self) -> &SemAct { match self { @@ -203,39 +197,6 @@ impl NormalForm { Self::Sequence { ty, .. } => ty, } } - - pub fn visible_bindings(&self, skip: usize) -> Box<[(&Ident, BoundTarget)]> { - let mut acc = VecDeque::new(); - for act in self.actions().iter().rev().skip(skip) { - match act { - Action::Shift { tag, output } => { - if let Some(ident) = output { - acc.push_front((ident, BoundTarget::Tag(tag))); - } - } - Action::Reduce { tag, output } => { - if let Some(ident) = output { - acc.push_front((ident, BoundTarget::Tag(tag))); - } - break; - } - Action::PassCollector(..) => continue, - Action::TailCall => continue, - } - } - if let Self::Sequence { - token_output: Some(tk), - .. - } = self - { - if acc.len() == self.actions().len() - skip - && !matches!(self.actions().first(), Some(Action::Reduce { .. })) - { - acc.push_front((tk, BoundTarget::Token)); - } - } - acc.into_iter().collect() - } } #[cfg(feature = "debug")] @@ -262,16 +223,11 @@ impl std::fmt::Display for NormalForm { } Self::Sequence { token, - token_output, actions, semact, .. } => { - if let Some(tk) = token_output { - styled_write!(f, Color::Yellow, "{token}[{tk}]")?; - } else { - styled_write!(f, Color::Yellow, "{token}")?; - } + styled_write!(f, Color::Yellow, "{token}")?; for action in actions.iter() { write!(f, "\t{}", action)?; } @@ -288,14 +244,14 @@ fn debug_print_test() { use quote::format_ident; let sequence = NormalForm::Sequence { token: format_ident!("TEST"), - token_output: Some(format_ident!("x")), actions: vec![ Action::Shift { tag: Tag::Toplevel(format_ident!("a")), output: None, }, Action::Reduce { - tag: Tag::Toplevel(format_ident!("b")), + semact: SemAct::Gather, + hint: None, output: Some(format_ident!("x")), }, Action::Shift { @@ -303,7 +259,8 @@ fn debug_print_test() { output: Some(format_ident!("y")), }, Action::Reduce { - tag: Tag::Anonymous(1), + semact: SemAct::Gather, + hint: None, output: None, }, ], @@ -366,14 +323,14 @@ fn debug_print_nf_table() { use quote::format_ident; let sequence = NormalForm::Sequence { token: format_ident!("TEST"), - token_output: Some(format_ident!("x")), actions: vec![ Action::Shift { tag: Tag::Toplevel(format_ident!("a")), output: None, }, Action::Reduce { - tag: Tag::Toplevel(format_ident!("b")), + semact: SemAct::Gather, + hint: None, output: Some(format_ident!("x")), }, Action::Shift { @@ -381,7 +338,8 @@ fn debug_print_nf_table() { output: Some(format_ident!("y")), }, Action::Reduce { - tag: Tag::Anonymous(1), + semact: SemAct::Gather, + hint: None, output: None, }, ], @@ -391,11 +349,13 @@ fn debug_print_nf_table() { let empty = NormalForm::Empty { actions: vec![ Action::Reduce { - tag: Tag::Toplevel(format_ident!("b")), + semact: SemAct::Gather, + hint: None, output: Some(format_ident!("x")), }, Action::Reduce { - tag: Tag::Toplevel(format_ident!("c")), + semact: SemAct::Gather, + hint: None, output: Some(format_ident!("y")), }, ], diff --git a/pag-parser2/src/nf/normalization.rs b/pag-parser2/src/nf/normalization.rs index 8b13789..53c74ba 100644 --- a/pag-parser2/src/nf/normalization.rs +++ b/pag-parser2/src/nf/normalization.rs @@ -1 +1,136 @@ +use std::{collections::HashMap, rc::Rc}; +use syn::Type; + +use crate::utils::Appendix; + +use super::{NFTable, Tag, translation::Translation, NormalForm, Action}; + +pub struct Normalized { + nfs: NFTable, + hints: HashMap> +} + +impl Normalized { + fn normalize(&mut self) { + loop { + let mut updates = Vec::new(); + for (target, nfs) in self.nfs.iter().map(|(k, v)| (k.clone(), v.clone())) { + if !nfs.iter().any(|x| matches!(x, NormalForm::Unexpanded {.. })) { + continue; + } + let mut stepped = Vec::new(); + for i in nfs { + let NormalForm::Unexpanded{ actions, semact, ty } = i else { + stepped.push(i); + continue; + }; + let first_subroutine = actions.iter().enumerate().find_map(|(index, act)| { + if let Action::Shift { tag, output } = act { + Some((index, tag, output.clone())) + } else { + None + } + }); + match first_subroutine { + None => { + stepped.push(NormalForm::Empty{ actions, semact, ty }); + } + Some((index, tag, output)) => { + let variable_nf = self.nfs.get(tag).cloned().expect("tag must have associated"); + for k in variable_nf { + let head = actions[..index].iter().cloned(); + let tail = actions[index + 1..].iter().cloned(); + match k { + NormalForm::Empty { actions: mut expanded_actions, semact: expanded_semact, .. } => { + let hint = self.hints.get(tag).cloned().map(Appendix); + expanded_actions.push(Action::Reduce { semact: expanded_semact, hint, output: output.clone()}); + let acts = head.chain(expanded_actions).chain(tail).collect(); + stepped.push(NormalForm::Unexpanded { actions: acts, semact: semact.clone(), ty: ty.clone() }); + } + NormalForm::Unexpanded { actions: mut expanded_actions, semact: expanded_semact, .. } => { + let hint = self.hints.get(tag).cloned().map(Appendix); + expanded_actions.push(Action::Reduce { semact: expanded_semact, hint, output: output.clone()}); + let acts = head.chain(expanded_actions).chain(tail).collect(); + stepped.push(NormalForm::Unexpanded { actions: acts, semact: semact.clone(), ty: ty.clone() }); + } + NormalForm::Sequence { + token, + actions: mut expanded_actions, + semact: expanded_semact, + .. + } => { + let hint = self.hints.get(tag).cloned().map(Appendix); + expanded_actions.push(Action::Reduce { semact: expanded_semact, hint, output: output.clone()}); + let acts = head.chain(expanded_actions).chain(tail).collect(); + stepped.push(NormalForm::Sequence { token, actions: acts, semact: semact.clone(), ty: ty.clone() }); + } + } + } + } + } + } + updates.push((target, stepped)); + } + if updates.is_empty() { + break; + } else { + for (k, v) in updates { + self.nfs.insert(k, v); + } + } + } + } +} + +impl From for Normalized { + fn from(value: Translation) -> Self { + let mut normalized = Self { + nfs: value.semi_nfs, + hints: value.hints + }; + normalized.normalize(); + normalized + } +} + +#[cfg(test)] +mod test { + use crate::{frontend::Ast, nf::normalization::Normalized}; + + use super::Translation; + + #[test] + fn sexpr() { + let ast = syn::parse_str::( + r#" + %entry = sexp; + + DIGIT = '0'..'9'; + ALPHA = 'a'..'z' | 'A'..'Z'; + LPAREN = "("; + RPAREN = ")"; + ATOM = ALPHA (ALPHA | DIGIT)*; + %skip = (" " | "\t" | "\n" | "\r")+; + + compound: SExp = LPAREN sexp+[sexp:Vec<_>] RPAREN { SExp::Compound(sexp) }; + atom : SExp = ATOM[atom] { SExp::Atom(atom) }; + sexp : SExp = compound + | atom; + "#, + ) + .unwrap(); + #[cfg(feature = "debug")] + { + let translation = Translation::from(&ast); + println!("{}", translation.semi_nfs); + let normalized = Normalized::from(translation); + println!("{}", normalized.nfs); + } + #[cfg(not(feature = "debug"))] + { + let translation = Translation::from(&ast); + let _ = Normalized::from(translation); + } + } +} \ No newline at end of file diff --git a/pag-parser2/src/nf/translation.rs b/pag-parser2/src/nf/translation.rs index 4f81cc0..0e64ad3 100644 --- a/pag-parser2/src/nf/translation.rs +++ b/pag-parser2/src/nf/translation.rs @@ -3,6 +3,7 @@ //! use std::collections::HashMap; +use std::io::Read; use std::rc::Rc; use quote::format_ident; @@ -12,13 +13,13 @@ use super::{semact::SemAct, Action, NormalForm, Tag}; use super::{AbstractType, NFTable}; use crate::frontend::{Ast, ParserDef, ParserExpr, SequenceIterator}; -struct Translation { +pub struct Translation { /// Table of semi-normalized production rules - semi_nfs: NFTable, + pub semi_nfs: NFTable, /// Toplevel type annotations annotations: HashMap>, /// Type hints when calling inner routines (collector) - hints: HashMap>, + pub hints: HashMap>, /// Counter of assigned non-explicit variable names output_cnt: usize, /// Counter of assigned anonymous routines @@ -154,7 +155,6 @@ impl Translation { let ty = self.infer_type(types.into_iter(), &semact, tag).into(); NormalForm::Sequence { token: token.clone(), - token_output: None, actions, semact, ty, @@ -165,7 +165,14 @@ impl Translation { if named.is_some() { types.push(AbstractType::span_type()) } - let actions = iter + let head_action = if matches!(semact, SemAct::Recognize) { + None + } else if IGNORE_UNNAMED && named.is_none() { + None + } else { + Some( Action::Reduce { semact: SemAct::Option, hint: None, output: named.or_else(|| Some(self.new_output_sym())) } ) + }; + let actions = head_action.into_iter().chain(iter .map(|(inner, named, hint)| { let (tag, output, ty) = self.add_anonymous_rule::(inner, named); @@ -176,18 +183,11 @@ impl Translation { types.push(ty); } Action::Shift { tag, output } - }) + })) .collect(); let ty = self.infer_type(types.into_iter(), &semact, tag).into(); NormalForm::Sequence { token: token.clone(), - token_output: if matches!(semact, SemAct::Recognize) { - None - } else if IGNORE_UNNAMED { - named - } else { - named.or_else(|| Some(self.new_output_sym())) - }, actions, semact, ty, @@ -341,7 +341,6 @@ impl Translation { let nf = if self.ignoring() { NormalForm::Sequence { token: ident.clone(), - token_output: None, actions: vec![], semact: SemAct::Recognize, ty: AbstractType::unit_type().into(), @@ -349,7 +348,6 @@ impl Translation { } else { NormalForm::Sequence { token: ident.clone(), - token_output: Some(self.new_output_sym()), actions: vec![], semact: SemAct::Token, ty: AbstractType::span_type().into(), diff --git a/pag-parser2/src/utils.rs b/pag-parser2/src/utils.rs index 32ee6dc..632465d 100644 --- a/pag-parser2/src/utils.rs +++ b/pag-parser2/src/utils.rs @@ -28,6 +28,7 @@ pub(crate) use styled; pub(crate) use styled_write; /// Appendix that does not count in equality/ordinality/hashing. +#[repr(transparent)] #[derive(Clone)] pub struct Appendix(pub T);