From 66b098e0ce97de718a3399cee90fca97ccc58aea Mon Sep 17 00:00:00 2001 From: Bruno Paulino Date: Sun, 2 Jun 2024 12:25:46 +0200 Subject: [PATCH] Flat AST with types (#10) --- md_parser/src/ast.rs | 49 +++++--- md_parser/src/lexer.rs | 2 + md_parser/src/parser.rs | 37 +++--- md_parser/src/renderer.rs | 52 ++++---- ...rser__tests__parse_markdown@input2.md.snap | 116 +++++++++--------- ..._tests__parse_markdown@lexer_input.md.snap | 74 +++++------ 6 files changed, 178 insertions(+), 152 deletions(-) diff --git a/md_parser/src/ast.rs b/md_parser/src/ast.rs index 2dbbf9d..3b71144 100644 --- a/md_parser/src/ast.rs +++ b/md_parser/src/ast.rs @@ -2,26 +2,47 @@ use serde::{Deserialize, Serialize}; #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] pub enum Node<'s> { + Header(Header<'s>), + Paragraph(Paragraph<'s>), + Link(Link<'s>), + Bold(Bold<'s>), + Italic(Italic<'s>), + Digit(&'s str), + Text(&'s str), + LineBreak, +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] +pub struct Header<'s> { + pub level: u8, #[serde(borrow)] - Block(BlockNode<'s>), - Inline(InlineNode<'s>), + pub children: Vec>, } -/// Block level elements #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] -pub enum BlockNode<'s> { - Heading(u8, Vec>), // (heading level, elements) +pub struct Paragraph<'s> { #[serde(borrow)] - Paragraph(Vec>), + pub children: Vec>, } -/// inline level elements #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] -pub enum InlineNode<'s> { - Bold(Vec>), - Italic(Vec>), - Link(Vec>, Vec>), // (elements, url) - Digit(usize), - Text(&'s str), - LineBreak, +pub struct Link<'s> { + #[serde(borrow)] + pub children: Vec>, + /// List of Text nodes + pub url: Vec>, + // TODO: Support title for tooltips + // title: Option<&'s str> +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] +pub struct Bold<'s> { + #[serde(borrow)] + pub children: Vec>, +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] +pub struct Italic<'s> { + #[serde(borrow)] + pub children: Vec>, } diff --git a/md_parser/src/lexer.rs b/md_parser/src/lexer.rs index 891e5d3..78c5ebe 100644 --- a/md_parser/src/lexer.rs +++ b/md_parser/src/lexer.rs @@ -64,6 +64,8 @@ impl<'a> Lexer<'a> { } } + /// A token can only be within the ASCII space + /// and must belong into our list of reserved symbols fn is_token(&self, c: Option) -> bool { match c { Some(c) => { diff --git a/md_parser/src/parser.rs b/md_parser/src/parser.rs index 87713e5..d7c22c2 100644 --- a/md_parser/src/parser.rs +++ b/md_parser/src/parser.rs @@ -1,4 +1,4 @@ -use crate::ast::{BlockNode, InlineNode, Node}; +use crate::ast::{Bold, Header, Link, Node, Paragraph}; use crate::token::{Span, Token}; use std::cmp::max; @@ -112,15 +112,15 @@ impl<'source> Parser<'source> { if heading_level > 0 && heading_level <= 6 && self.match_token(Token::Space) { let mut inline_elements = Vec::new(); while let Some(inline) = self.inline(Parent::Block) { - if inline == InlineNode::LineBreak { + if inline == Node::LineBreak { break; } inline_elements.push(inline) } - return Some(Node::Block(BlockNode::Heading( - heading_level, - inline_elements, - ))); + return Some(Node::Header(Header { + level: heading_level, + children: inline_elements, + })); } // in case of detected hashes, at this point, @@ -152,10 +152,12 @@ impl<'source> Parser<'source> { return None; } - Some(Node::Block(BlockNode::Paragraph(inline_elements))) + Some(Node::Paragraph(Paragraph { + children: inline_elements, + })) } - fn inline(&mut self, parent: Parent) -> Option> { + fn inline(&mut self, parent: Parent) -> Option> { if self.is_at_end() { return None; } @@ -167,7 +169,7 @@ impl<'source> Parser<'source> { Token::Newline if self.check_next(Token::Newline) => { return None; } - Token::Newline => InlineNode::LineBreak, + Token::Newline => Node::LineBreak, Token::Star => return self.maybe_bold(), Token::LeftSquareBracket if parent == Parent::Block => return self.maybe_link(), Token::Text(_) @@ -182,7 +184,7 @@ impl<'source> Parser<'source> { | Token::RightParen | Token::LeftSquareBracket | Token::RightSquareBracket - | Token::Backslash => InlineNode::Text(token.literal()), + | Token::Backslash => Node::Text(token.literal()), t if t.is_block_level_token() => return None, t => todo!("unhandled token: {}", t), }; @@ -193,7 +195,7 @@ impl<'source> Parser<'source> { None } - fn maybe_link(&mut self) -> Option> { + fn maybe_link(&mut self) -> Option> { let mut markers: [u8; 4] = [0, 0, 0, 0]; let rewind_position = self.current; // Any inline element can partially show-up and should be represented as text, @@ -252,16 +254,19 @@ impl<'source> Parser<'source> { self.consume(&Token::RightParen); - return Some(InlineNode::Link(link_text, url)); + return Some(Node::Link(Link { + children: link_text, + url, + })); } // Otherwise we bail, rewind and let the next loop handle // each token as as normal text or other inline elements self.consume(&Token::LeftSquareBracket); - Some(InlineNode::Text(Token::LeftSquareBracket.literal())) + Some(Node::Text(Token::LeftSquareBracket.literal())) } - fn maybe_bold(&mut self) -> Option> { + fn maybe_bold(&mut self) -> Option> { let mut markers: [u8; 2] = [0, 0]; let rewind_position = self.current; 'outer: while markers != [1, 1] && !self.is_at_end() { @@ -303,14 +308,14 @@ impl<'source> Parser<'source> { // Consume the wrapping "**" around bold tokens self.consume(&Token::Star); self.consume(&Token::Star); - return Some(InlineNode::Bold(inner)); + return Some(Node::Bold(Bold { children: inner })); } // Otherwise we bail, rewind and let the next loop handle each token // be handled as normal text or other inline elements self.rewind(rewind_position); self.consume(&Token::Star); - Some(InlineNode::Text(Token::Star.literal())) + Some(Node::Text(Token::Star.literal())) } fn consume(&mut self, kind: &Token) -> &Token { diff --git a/md_parser/src/renderer.rs b/md_parser/src/renderer.rs index ed7f366..0e2ace9 100644 --- a/md_parser/src/renderer.rs +++ b/md_parser/src/renderer.rs @@ -1,8 +1,4 @@ -use crate::{ - ast::{BlockNode, InlineNode, Node}, - lexer::Lexer, - parser::Parser, -}; +use crate::{ast::Node, lexer::Lexer, parser::Parser}; /// Renders an HTML string from the given AST /// @@ -31,60 +27,62 @@ fn render(ast: Vec) -> String { fn visit(buffer: &mut String, node: &Node) { match node { - Node::Block(block) => visit_block(buffer, block), - Node::Inline(inline) => visit_inline(buffer, inline), + Node::Header(_) | Node::Paragraph(_) => visit_block(buffer, node), + node => visit_inline(buffer, node), } } -fn visit_block(buffer: &mut String, node: &BlockNode) { +fn visit_block(buffer: &mut String, node: &Node) { match node { - BlockNode::Heading(level, inline_nodes) => { - buffer.push_str(&format!("", level)); - visit_inline_nodes(buffer, inline_nodes); - buffer.push_str(&format!("", level)); + Node::Header(header) => { + buffer.push_str(&format!("", header.level)); + visit_inline_nodes(buffer, &header.children); + buffer.push_str(&format!("", header.level)); } - BlockNode::Paragraph(inline_nodes) => { + Node::Paragraph(paragraph) => { buffer.push_str("

"); - for (idx, node) in inline_nodes.iter().enumerate() { + for (idx, node) in paragraph.children.iter().enumerate() { // Within a paragraph, whenever we hit the last node // and it's a newline, we can just discard it as the // paragraph element behaves itself as a block. - if idx >= inline_nodes.len() - 1 && node == &InlineNode::LineBreak { + if idx >= paragraph.children.len() - 1 && node == &Node::LineBreak { continue; } visit_inline(buffer, node); } buffer.push_str("

"); } + _ => panic!("Node {:#?} not supported as a block node type", node), } } -fn visit_inline(buffer: &mut String, node: &InlineNode) { +fn visit_inline(buffer: &mut String, node: &Node) { match node { - InlineNode::Text(txt) => buffer.push_str(txt), - InlineNode::Bold(inline_nodes) => { + Node::Text(txt) => buffer.push_str(txt), + Node::Bold(bold) => { buffer.push_str(""); - visit_inline_nodes(buffer, inline_nodes); + visit_inline_nodes(buffer, &bold.children); buffer.push_str(""); } - InlineNode::Digit(d) => buffer.push_str(&d.to_string()), - InlineNode::LineBreak => buffer.push_str("
"), - InlineNode::Italic(inline_nodes) => { + Node::Digit(d) => buffer.push_str(d), + Node::LineBreak => buffer.push_str("
"), + Node::Italic(italic) => { buffer.push_str(""); - visit_inline_nodes(buffer, inline_nodes); + visit_inline_nodes(buffer, &italic.children); buffer.push_str(""); } - InlineNode::Link(text_nodes, link_nodes) => { + Node::Link(link) => { buffer.push_str(r#""#); - visit_inline_nodes(buffer, text_nodes); + visit_inline_nodes(buffer, &link.children); buffer.push_str(""); } + _ => panic!("Node {:#?} not supported as a inline node type", node), } } -fn visit_inline_nodes(buffer: &mut String, nodes: &[InlineNode]) { +fn visit_inline_nodes(buffer: &mut String, nodes: &[Node]) { for inline in nodes.iter() { visit_inline(buffer, inline); } diff --git a/md_parser/src/snapshots/md_parser__parser__tests__parse_markdown@input2.md.snap b/md_parser/src/snapshots/md_parser__parser__tests__parse_markdown@input2.md.snap index c7c8c4a..fec32a8 100644 --- a/md_parser/src/snapshots/md_parser__parser__tests__parse_markdown@input2.md.snap +++ b/md_parser/src/snapshots/md_parser__parser__tests__parse_markdown@input2.md.snap @@ -5,38 +5,36 @@ input_file: md_parser/src/snapshot_inputs/input2.md --- [ { - "Block": { - "Heading": [ - 3, - [ - { - "Text": "Article" - }, - { - "Text": " " - }, - { - "Text": "of" - }, - { - "Text": " " - }, - { - "Text": "the" - }, - { - "Text": " " - }, - { - "Text": "week" - } - ] + "Header": { + "level": 3, + "children": [ + { + "Text": "Article" + }, + { + "Text": " " + }, + { + "Text": "of" + }, + { + "Text": " " + }, + { + "Text": "the" + }, + { + "Text": " " + }, + { + "Text": "week" + } ] } }, { - "Block": { - "Paragraph": [ + "Paragraph": { + "children": [ { "Text": "In" }, @@ -68,8 +66,8 @@ input_file: md_parser/src/snapshot_inputs/input2.md "Text": " " }, { - "Link": [ - [ + "Link": { + "children": [ { "Text": "Prater" }, @@ -80,7 +78,7 @@ input_file: md_parser/src/snapshot_inputs/input2.md "Text": "Park" } ], - [ + "url": [ { "Text": "https://www" }, @@ -97,7 +95,7 @@ input_file: md_parser/src/snapshot_inputs/input2.md "Text": "com/en/home" } ] - ] + } }, { "Text": " " @@ -203,11 +201,13 @@ input_file: md_parser/src/snapshot_inputs/input2.md "Text": " " }, { - "Bold": [ - { - "Text": "seasons" - } - ] + "Bold": { + "children": [ + { + "Text": "seasons" + } + ] + } }, { "Text": "." @@ -216,8 +216,8 @@ input_file: md_parser/src/snapshot_inputs/input2.md } }, { - "Block": { - "Paragraph": [ + "Paragraph": { + "children": [ { "Text": "Particularly" }, @@ -255,11 +255,13 @@ input_file: md_parser/src/snapshot_inputs/input2.md "Text": " " }, { - "Bold": [ - { - "Text": "green" - } - ] + "Bold": { + "children": [ + { + "Text": "green" + } + ] + } }, { "Text": " " @@ -280,20 +282,18 @@ input_file: md_parser/src/snapshot_inputs/input2.md } }, { - "Block": { - "Heading": [ - 2, - [ - { - "Text": "Another" - }, - { - "Text": " " - }, - { - "Text": "header" - } - ] + "Header": { + "level": 2, + "children": [ + { + "Text": "Another" + }, + { + "Text": " " + }, + { + "Text": "header" + } ] } } diff --git a/md_parser/src/snapshots/md_parser__parser__tests__parse_markdown@lexer_input.md.snap b/md_parser/src/snapshots/md_parser__parser__tests__parse_markdown@lexer_input.md.snap index a281db2..b830edb 100644 --- a/md_parser/src/snapshots/md_parser__parser__tests__parse_markdown@lexer_input.md.snap +++ b/md_parser/src/snapshots/md_parser__parser__tests__parse_markdown@lexer_input.md.snap @@ -5,26 +5,24 @@ input_file: md_parser/src/snapshot_inputs/lexer_input.md --- [ { - "Block": { - "Heading": [ - 1, - [ - { - "Text": "Hi" - }, - { - "Text": " " - }, - { - "Text": "there" - } - ] + "Header": { + "level": 1, + "children": [ + { + "Text": "Hi" + }, + { + "Text": " " + }, + { + "Text": "there" + } ] } }, { - "Block": { - "Paragraph": [ + "Paragraph": { + "children": [ { "Text": "This" }, @@ -90,23 +88,25 @@ input_file: md_parser/src/snapshot_inputs/lexer_input.md }, "LineBreak", { - "Bold": [ - { - "Text": "bold" - }, - { - "Text": " " - }, - { - "Text": "text" - }, - { - "Text": " " - }, - { - "Text": "here" - } - ] + "Bold": { + "children": [ + { + "Text": "bold" + }, + { + "Text": " " + }, + { + "Text": "text" + }, + { + "Text": " " + }, + { + "Text": "here" + } + ] + } }, { "Text": " " @@ -130,8 +130,8 @@ input_file: md_parser/src/snapshot_inputs/lexer_input.md "Text": " " }, { - "Link": [ - [ + "Link": { + "children": [ { "Text": "link" }, @@ -142,7 +142,7 @@ input_file: md_parser/src/snapshot_inputs/lexer_input.md "Text": "here" } ], - [ + "url": [ { "Text": "https://bpaulino" }, @@ -153,7 +153,7 @@ input_file: md_parser/src/snapshot_inputs/lexer_input.md "Text": "com" } ] - ] + } }, { "Text": " "