Skip to content

Commit

Permalink
Fix parsing of documents that may contain XML before Doctype (#149)
Browse files Browse the repository at this point in the history
* Fix parsing of documents that may contain XML before Doctype

This fixes parsing of malformed documents that start with an XML declaration
(e.g. `<?xml version="1.0"?>`), or even a comment, before the doctype declaration.

* Change the assertion to check the number of comments seen before the doctype
  • Loading branch information
philss authored May 15, 2024
1 parent a092e27 commit 87239c3
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 1 deletion.
10 changes: 9 additions & 1 deletion native/html5ever_nif/src/flat_dom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,8 @@ pub fn flat_sink_to_rec_term<'a>(
child_base: 0,
child_n: 0,
}];
let mut comments_bf_doctype = 0usize;
let mut read_doctype = false;

loop {
let mut top = stack.pop().unwrap();
Expand Down Expand Up @@ -567,7 +569,9 @@ pub fn flat_sink_to_rec_term<'a>(
system_id,
} => {
assert!(!stack.is_empty());
assert!(child_stack.is_empty());
assert!(child_stack.is_empty() || comments_bf_doctype == child_stack.len());

read_doctype = true;

term = (
atoms::doctype(),
Expand Down Expand Up @@ -596,6 +600,10 @@ pub fn flat_sink_to_rec_term<'a>(
term = StrTendrilWrapper(contents).encode(env);
}
NodeData::Comment { contents } => {
if !read_doctype {
comments_bf_doctype += 1
};

term = (atoms::comment(), StrTendrilWrapper(contents)).encode(env);
}
_ => unimplemented!(""),
Expand Down
42 changes: 42 additions & 0 deletions test/html5ever_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -308,4 +308,46 @@ defmodule Html5everTest do
]}
]}
end

test "parse html starting with a XML tag" do
# Regression test: the document puts an XML declaration and a regular comment
# *before* the doctype declaration, instead of starting with the doctype.
html = """
<?xml version="1.0" encoding="UTF-8"?>
<!-- also a comment is allowed -->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head><title>Hello</title></head>
<body>
<a id="anchor" href="https://example.com">link</a>
</body>
</html>
"""

# Expected result, in document order:
#   * the XML declaration comes back as a :comment node (its text is the
#     declaration without the surrounding angle brackets),
#   * the real comment follows as a second :comment node,
#   * then the :doctype tuple with name, public id and system id,
#   * then the "html" element tree.
# The whitespace-only strings are the text nodes found between tags.
# NOTE(review): their exact contents depend on the heredoc's indentation —
# confirm against the original file if this test is ever re-indented.
assert Html5ever.parse(html) ==
{:ok,
[
{:comment, "?xml version=\"1.0\" encoding=\"UTF-8\"?"},
{:comment, " also a comment is allowed "},
{:doctype, "html", "-//W3C//DTD XHTML 1.0 Strict//EN",
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"},
{
"html",
[{"xmlns", "http://www.w3.org/1999/xhtml"}, {"xml:lang", "en"}, {"lang", "en"}],
[
{"head", [], [{"title", [], ["Hello"]}]},
"\n",
" ",
{"body", [],
[
"\n",
" ",
{"a", [{"id", "anchor"}, {"href", "https://example.com"}], ["link"]},
"\n",
" ",
"\n",
"\n"
]}
]
}
]}
end
end

0 comments on commit 87239c3

Please sign in to comment.