Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle nested inline elements for links #11

Merged
merged 2 commits into from
Jun 5, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 132 additions & 59 deletions md_parser/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use crate::ast::{Bold, Header, Link, Node, Paragraph};
use crate::token::{Span, Token};

use std::cmp::max;
use std::ops::Range;

// Markdown Grammar
// (* A document is a series of blocks *)
Expand Down Expand Up @@ -56,24 +57,15 @@ use std::cmp::max;
// title = ? any string ? ;
// alt_text = ? any string ? ;

// The given parent of an element when recursing
// on inner elements. Helpful for controlling whether
// we allow some types of elements to have chidren elements
#[derive(PartialEq, Eq)]
enum Parent {
Block,
Inline,
}

/// Recursive Descent Parser for transforming
/// the given list of tokens a DOM AST
pub struct Parser<'source> {
current: usize,
tokens: &'source Vec<(Token<'source>, Span)>,
tokens: &'source [(Token<'source>, Span)],
}

impl<'source> Parser<'source> {
pub fn new(tokens: &'source Vec<(Token<'source>, Span)>) -> Self {
pub fn new(tokens: &'source [(Token<'source>, Span)]) -> Self {
Self { tokens, current: 0 }
}

Expand All @@ -88,6 +80,21 @@ impl<'source> Parser<'source> {
nodes
}

/// Parser step for nested inline elements only.
/// Helpful for cases where we want to restrict parsing
/// for within a specific range of tokens within another inline element.
/// e.g. Links containing bold text and other allowed inline elements
fn parse_inline(&mut self) -> Vec<Node<'source>> {
let mut nodes = Vec::new();
while !self.is_at_end() {
if let Some(node) = self.inline() {
nodes.push(node);
}
}

nodes
}

fn block(&mut self) -> Option<Node<'source>> {
while self.check(&Token::Newline) {
self.consume(&Token::Newline);
Expand All @@ -111,7 +118,7 @@ impl<'source> Parser<'source> {

if heading_level > 0 && heading_level <= 6 && self.match_token(Token::Space) {
let mut inline_elements = Vec::new();
while let Some(inline) = self.inline(Parent::Block) {
while let Some(inline) = self.inline() {
if inline == Node::LineBreak {
break;
}
Expand Down Expand Up @@ -144,7 +151,7 @@ impl<'source> Parser<'source> {

let mut inline_elements = Vec::new();

while let Some(inline) = self.inline(Parent::Block) {
while let Some(inline) = self.inline() {
inline_elements.push(inline);
}

Expand All @@ -157,21 +164,26 @@ impl<'source> Parser<'source> {
}))
}

fn inline(&mut self, parent: Parent) -> Option<Node<'source>> {
fn inline(&mut self) -> Option<Node<'source>> {
if self.is_at_end() {
return None;
}

if let Some((token, _)) = self.peek() {
let node = match token {
// Hitting end of the file, just advance and halt
Token::EndOfFile => {
self.advance();
return None;
}
// Two consecutive newlines should break off from any inline elements
// and give it a chance to a new block or inline element to be constructed
Token::Newline if self.check_next(Token::Newline) => {
return None;
}
Token::Newline => Node::LineBreak,
Token::Star => return self.maybe_bold(),
Token::LeftSquareBracket if parent == Parent::Block => return self.maybe_link(),
Token::LeftSquareBracket => return self.maybe_link(),
Token::Text(_)
| Token::Digit(_)
| Token::Space
Expand All @@ -182,11 +194,12 @@ impl<'source> Parser<'source> {
| Token::Hash
| Token::LeftParen
| Token::RightParen
| Token::LeftSquareBracket
| Token::RightSquareBracket
| Token::Backslash => Node::Text(token.literal()),
// block-level tokens should be interpreted outside of the inline loop
// to give them a chance of being interpreted as block-level elements
t if t.is_block_level_token() => return None,
t => todo!("unhandled token: {}", t),
t => todo!("Token not handled yet: {}", t),
};
self.advance();
return Some(node);
Expand All @@ -196,26 +209,32 @@ impl<'source> Parser<'source> {
}

fn maybe_link(&mut self) -> Option<Node<'source>> {
let mut markers: [u8; 4] = [0, 0, 0, 0];
let mut marker = LinkMarker::new();
let rewind_position = self.current;
let mut steps = 0;
// Any inline element can partially show-up and should be represented as text,
// but if we find the right token makers that can complete a link, we should
// rewind and structure it as a Link inline node instead.
'outer: while markers != [1, 1, 1, 1] && !self.is_at_end() {
while let Some((next, _)) = self.advance() {
while !marker.is_link() && !self.is_at_end() {
if let Some((next, _)) = self.advance() {
steps += 1;
match next {
Token::LeftSquareBracket if markers == [0, 0, 0, 0] => markers[0] = 1,
Token::LeftSquareBracket if marker.is_empty() => {
marker.set_start_text(self.current)
}
// The closing text of a link must be followed by "]("
Token::RightSquareBracket if markers == [1, 0, 0, 0] => {
Token::RightSquareBracket if marker.has_open_text() => {
if self.peek_token().is_some_and(|t| t == &Token::LeftParen) {
markers[1] = 1;
markers[2] = 1;
marker.set_end_text(self.current - 1);
marker.set_start_url(self.current + 1);
}
}
Token::RightParen if markers == [1, 1, 1, 0] => markers[3] = 1,
Token::RightParen if marker.has_open_url() => {
marker.set_end_url(self.current - 1)
}
token if token == &Token::Newline => {
if let Some(&(Token::Newline, _)) = self.peek() {
break 'outer;
break;
}
}
_ => {}
Expand All @@ -227,37 +246,20 @@ impl<'source> Parser<'source> {

// We are guaranteed to have a well-structured link here
// lets force-consume all the special tokens
if markers == [1, 1, 1, 1] {
self.consume(&Token::LeftSquareBracket);
if let Some((text_range, url_range)) = marker.ranges() {
let mut text_parser = Self::new(&self.tokens[text_range]);
let text_nodes = text_parser.parse_inline();

let mut link_text = Vec::new();

while !self.check(&Token::RightSquareBracket) && !self.is_at_end() {
if let Some(inline) = self.inline(Parent::Inline) {
link_text.push(inline);
} else {
break;
}
}

self.consume(&Token::RightSquareBracket);
self.consume(&Token::LeftParen);

let mut url = Vec::new();
while !self.check(&Token::RightParen) && !self.is_at_end() {
if let Some(inline) = self.inline(Parent::Inline) {
url.push(inline);
} else {
break;
}
}
let mut url_parser = Self::new(&self.tokens[url_range]);
let url_nodes = url_parser.parse_inline();
self.current += steps;

self.consume(&Token::RightParen);
let link = Node::Link(Link {
children: text_nodes,
url: url_nodes,
});

return Some(Node::Link(Link {
children: link_text,
url,
}));
return Some(link);
}

// Otherwise we bail, rewind and let the next loop handle
Expand Down Expand Up @@ -299,7 +301,7 @@ impl<'source> Parser<'source> {
self.consume(&Token::Star);
let mut inner = Vec::new();
while !self.check(&Token::Star) && !self.is_at_end() {
if let Some(inline) = self.inline(Parent::Inline) {
if let Some(inline) = self.inline() {
inner.push(inline);
} else {
panic!("Invalid inline node for link URL component");
Expand Down Expand Up @@ -402,10 +404,81 @@ impl<'source> Parser<'source> {
}

fn is_at_end(&self) -> bool {
self.tokens
.get(self.current)
.filter(|t| t.0 == Token::EndOfFile)
.is_some()
self.current >= self.tokens.len()
}
}

#[derive(Debug)]
struct LinkMarker {
start_text: Option<usize>,
end_text: Option<usize>,
start_url: Option<usize>,
end_url: Option<usize>,
}

/// helful for holding the boundaries of a Link element during parsing
impl LinkMarker {
fn new() -> Self {
Self {
start_text: None,
end_text: None,
start_url: None,
end_url: None,
}
}

fn set_start_text(&mut self, index: usize) {
self.start_text = Some(index);
}

fn set_end_text(&mut self, index: usize) {
self.end_text = Some(index);
}

fn set_start_url(&mut self, index: usize) {
self.start_url = Some(index);
}

fn set_end_url(&mut self, index: usize) {
self.end_url = Some(index);
}

fn is_link(&self) -> bool {
self.start_text.is_some()
&& self.end_text.is_some()
&& self.start_url.is_some()
&& self.end_url.is_some()
}

fn is_empty(&self) -> bool {
self.start_text.is_none()
&& self.end_text.is_none()
&& self.start_url.is_none()
&& self.end_url.is_none()
}

fn has_open_text(&self) -> bool {
self.start_text.is_some()
&& self.end_text.is_none()
&& self.start_url.is_none()
&& self.end_url.is_none()
}

fn has_open_url(&self) -> bool {
self.start_text.is_some()
&& self.end_text.is_some()
&& self.start_url.is_some()
&& self.end_url.is_none()
}

/// given a complete link, extract the ranges of its inner components
fn ranges(&self) -> Option<(Range<usize>, Range<usize>)> {
match (self.start_text, self.end_text, self.start_url, self.end_url) {
(Some(text_start), Some(text_end), Some(url_start), Some(url_end)) => {
Some((text_start..text_end, url_start..url_end))
}
_ => None,
}
}
}

Expand Down