From 73475707777153113d5214cbb50449ffaa5c3c6e Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Wed, 15 May 2024 01:37:06 -0300 Subject: [PATCH 1/2] Fix parsing of documents that may contain XML before Doctype This is a fix for malformed documents that may start with an XML tag, or even a comment before the declaration of the doctype. --- native/html5ever_nif/src/flat_dom.rs | 10 ++++++- test/html5ever_test.exs | 42 ++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/native/html5ever_nif/src/flat_dom.rs b/native/html5ever_nif/src/flat_dom.rs index da2eee3..960f5f4 100644 --- a/native/html5ever_nif/src/flat_dom.rs +++ b/native/html5ever_nif/src/flat_dom.rs @@ -526,6 +526,8 @@ pub fn flat_sink_to_rec_term<'a>( child_base: 0, child_n: 0, }]; + let mut comments_bf_doctype = 0u16; + let mut read_doctype = false; loop { let mut top = stack.pop().unwrap(); @@ -567,7 +569,9 @@ pub fn flat_sink_to_rec_term<'a>( system_id, } => { assert!(!stack.is_empty()); - assert!(child_stack.is_empty()); + assert!(child_stack.is_empty() || comments_bf_doctype > 0); + + read_doctype = true; term = ( atoms::doctype(), @@ -596,6 +600,10 @@ pub fn flat_sink_to_rec_term<'a>( term = StrTendrilWrapper(contents).encode(env); } NodeData::Comment { contents } => { + if !read_doctype { + comments_bf_doctype += 1 + }; + term = (atoms::comment(), StrTendrilWrapper(contents)).encode(env); } _ => unimplemented!(""), diff --git a/test/html5ever_test.exs b/test/html5ever_test.exs index 9fbbae5..44797dc 100644 --- a/test/html5ever_test.exs +++ b/test/html5ever_test.exs @@ -308,4 +308,46 @@ defmodule Html5everTest do ]} ]} end + + test "parse html starting with a XML tag" do + html = """ + + + + + Hello + + link + + + """ + + assert Html5ever.parse(html) == + {:ok, + [ + {:comment, "?xml version=\"1.0\" encoding=\"UTF-8\"?"}, + {:comment, " also a comment is allowed "}, + {:doctype, "html", "-//W3C//DTD XHTML 1.0 Strict//EN", + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"}, + { + "html", + [{"xmlns", "http://www.w3.org/1999/xhtml"}, {"xml:lang", "en"}, {"lang", "en"}], + [ + {"head", [], [{"title", [], ["Hello"]}]}, + "\n", + " ", + {"body", [], + [ + "\n", + " ", + {"a", [{"id", "anchor"}, {"href", "https://example.com"}], ["link"]}, + "\n", + " ", + "\n", + "\n" + ]} + ] + } + ]} + end end From dac4f70db291d2e6c3e0f4b3fe2d8defb87e0c65 Mon Sep 17 00:00:00 2001 From: Philip Sampaio Date: Wed, 15 May 2024 15:10:29 -0300 Subject: [PATCH 2/2] Change assertion to check comments len --- native/html5ever_nif/src/flat_dom.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/native/html5ever_nif/src/flat_dom.rs b/native/html5ever_nif/src/flat_dom.rs index 960f5f4..f1d0c8f 100644 --- a/native/html5ever_nif/src/flat_dom.rs +++ b/native/html5ever_nif/src/flat_dom.rs @@ -526,7 +526,7 @@ pub fn flat_sink_to_rec_term<'a>( child_base: 0, child_n: 0, }]; - let mut comments_bf_doctype = 0u16; + let mut comments_bf_doctype = 0usize; let mut read_doctype = false; loop { @@ -569,7 +569,7 @@ pub fn flat_sink_to_rec_term<'a>( system_id, } => { assert!(!stack.is_empty()); - assert!(child_stack.is_empty() || comments_bf_doctype > 0); + assert!(child_stack.is_empty() || comments_bf_doctype == child_stack.len()); read_doctype = true;