Merge pull request #2924 from ehuss/html-end-tags
Add better handling for unbalanced HTML tags
This commit is contained in:
commit
7d1566860c
2 changed files with 132 additions and 44 deletions
|
|
@ -19,7 +19,7 @@ use pulldown_cmark::{Alignment, CodeBlockKind, CowStr, Event, LinkType, Tag, Tag
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::ops::Deref;
|
use std::ops::Deref;
|
||||||
use tracing::{error, trace, warn};
|
use tracing::{trace, warn};
|
||||||
|
|
||||||
/// Helper to create a [`QualName`].
|
/// Helper to create a [`QualName`].
|
||||||
macro_rules! attr_qual_name {
|
macro_rules! attr_qual_name {
|
||||||
|
|
@ -307,6 +307,8 @@ where
|
||||||
match event {
|
match event {
|
||||||
Event::Start(tag) => self.start_tag(tag),
|
Event::Start(tag) => self.start_tag(tag),
|
||||||
Event::End(tag) => {
|
Event::End(tag) => {
|
||||||
|
// TODO: This should validate that the event stack is
|
||||||
|
// properly synchronized with the tag stack.
|
||||||
self.pop();
|
self.pop();
|
||||||
match tag {
|
match tag {
|
||||||
TagEnd::TableHead => {
|
TagEnd::TableHead => {
|
||||||
|
|
@ -378,6 +380,7 @@ where
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
self.finish_stack();
|
||||||
self.collect_footnote_defs();
|
self.collect_footnote_defs();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -606,40 +609,10 @@ where
|
||||||
trace!("html token={token:?}");
|
trace!("html token={token:?}");
|
||||||
match token {
|
match token {
|
||||||
Token::DoctypeToken(_) => {}
|
Token::DoctypeToken(_) => {}
|
||||||
Token::TagToken(tag) => {
|
Token::TagToken(tag) => match tag.kind {
|
||||||
match tag.kind {
|
TagKind::StartTag => self.start_html_tag(tag, &mut is_raw),
|
||||||
TagKind::StartTag => {
|
TagKind::EndTag => self.end_html_tag(tag, &mut is_raw),
|
||||||
let is_closed = is_void_element(&tag.name) || tag.self_closing;
|
},
|
||||||
is_raw = matches!(&*tag.name, "script" | "style");
|
|
||||||
let name = QualName::new(None, html5ever::ns!(html), tag.name);
|
|
||||||
let attrs = tag
|
|
||||||
.attrs
|
|
||||||
.into_iter()
|
|
||||||
.map(|attr| (attr.name, attr.value))
|
|
||||||
.collect();
|
|
||||||
let mut el = Element {
|
|
||||||
name,
|
|
||||||
attrs,
|
|
||||||
self_closing: tag.self_closing,
|
|
||||||
was_raw: true,
|
|
||||||
};
|
|
||||||
fix_html_link(&mut el);
|
|
||||||
self.push(Node::Element(el));
|
|
||||||
if is_closed {
|
|
||||||
// No end element.
|
|
||||||
self.pop();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
TagKind::EndTag => {
|
|
||||||
is_raw = false;
|
|
||||||
if self.is_html_tag_matching(&tag.name) {
|
|
||||||
self.pop();
|
|
||||||
}
|
|
||||||
// else the stack is corrupt. I'm not really sure
|
|
||||||
// what to do here...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Token::CommentToken(comment) => {
|
Token::CommentToken(comment) => {
|
||||||
self.append(Node::Comment(comment));
|
self.append(Node::Comment(comment));
|
||||||
}
|
}
|
||||||
|
|
@ -664,22 +637,59 @@ where
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Adds an open HTML tag.
|
||||||
|
fn start_html_tag(&mut self, tag: html5ever::tokenizer::Tag, is_raw: &mut bool) {
|
||||||
|
let is_closed = is_void_element(&tag.name) || tag.self_closing;
|
||||||
|
*is_raw = matches!(&*tag.name, "script" | "style");
|
||||||
|
let name = QualName::new(None, html5ever::ns!(html), tag.name);
|
||||||
|
let attrs = tag
|
||||||
|
.attrs
|
||||||
|
.into_iter()
|
||||||
|
.map(|attr| (attr.name, attr.value))
|
||||||
|
.collect();
|
||||||
|
let mut el = Element {
|
||||||
|
name,
|
||||||
|
attrs,
|
||||||
|
self_closing: tag.self_closing,
|
||||||
|
was_raw: true,
|
||||||
|
};
|
||||||
|
fix_html_link(&mut el);
|
||||||
|
self.push(Node::Element(el));
|
||||||
|
if is_closed {
|
||||||
|
// No end element.
|
||||||
|
self.pop();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Closes the given HTML tag.
|
||||||
|
fn end_html_tag(&mut self, tag: html5ever::tokenizer::Tag, is_raw: &mut bool) {
|
||||||
|
*is_raw = false;
|
||||||
|
if self.is_html_tag_matching(&tag.name) {
|
||||||
|
self.pop();
|
||||||
|
} else {
|
||||||
|
// The proper thing to do here is to recover. However, the HTML
|
||||||
|
// parsing algorithm for that is quite complex. See
|
||||||
|
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
|
||||||
|
// and the adoption agency algorithm.
|
||||||
|
warn!(
|
||||||
|
"unexpected HTML end tag `</{}>` found in `{}`\n\
|
||||||
|
Check that the HTML tags are properly balanced.",
|
||||||
|
tag.name,
|
||||||
|
self.options.path.display()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// This is used to verify HTML parsing keeps the stack of tags in sync.
|
/// This is used to verify HTML parsing keeps the stack of tags in sync.
|
||||||
fn is_html_tag_matching(&self, name: &str) -> bool {
|
fn is_html_tag_matching(&self, name: &str) -> bool {
|
||||||
let current = self.tree.get(self.current_node).unwrap().value();
|
let current = self.tree.get(self.current_node).unwrap().value();
|
||||||
if let Node::Element(el) = current
|
if let Node::Element(el) = current
|
||||||
&& el.name() == name
|
&& el.name() == name
|
||||||
{
|
{
|
||||||
return true;
|
true
|
||||||
|
} else {
|
||||||
|
false
|
||||||
}
|
}
|
||||||
error!(
|
|
||||||
"internal error: HTML tag stack out of sync.\n
|
|
||||||
path: `{}`\n\
|
|
||||||
current={current:?}\n\
|
|
||||||
pop name: {name}",
|
|
||||||
self.options.path.display()
|
|
||||||
);
|
|
||||||
false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Eats all pulldown-cmark events until the next `End` matching the
|
/// Eats all pulldown-cmark events until the next `End` matching the
|
||||||
|
|
@ -736,6 +746,40 @@ where
|
||||||
output
|
output
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Deals with any unclosed elements on the stack.
|
||||||
|
fn finish_stack(&mut self) {
|
||||||
|
while let Some(node_id) = self.tag_stack.pop() {
|
||||||
|
let node = self.tree.get(node_id).unwrap().value();
|
||||||
|
match node {
|
||||||
|
Node::Fragment => {}
|
||||||
|
Node::Element(el) => {
|
||||||
|
if el.was_raw {
|
||||||
|
warn!(
|
||||||
|
"unclosed HTML tag `<{}>` found in `{}`",
|
||||||
|
el.name.local,
|
||||||
|
self.options.path.display()
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
panic!(
|
||||||
|
"internal error: expected empty tag stack.\n
|
||||||
|
path: `{}`\n\
|
||||||
|
element={el:?}",
|
||||||
|
self.options.path.display()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
node => {
|
||||||
|
panic!(
|
||||||
|
"internal error: expected empty tag stack.\n
|
||||||
|
path: `{}`\n\
|
||||||
|
node={node:?}",
|
||||||
|
self.options.path.display()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Appends a new footnote reference.
|
/// Appends a new footnote reference.
|
||||||
fn footnote_reference(&mut self, name: CowStr<'event>) {
|
fn footnote_reference(&mut self, name: CowStr<'event>) {
|
||||||
let len = self.footnote_numbers.len() + 1;
|
let len = self.footnote_numbers.len() + 1;
|
||||||
|
|
|
||||||
|
|
@ -239,3 +239,47 @@ fn html_blocks() {
|
||||||
fn code_block_fenced_with_indent() {
|
fn code_block_fenced_with_indent() {
|
||||||
BookTest::from_dir("rendering/code_blocks_fenced_with_indent").check_all_main_files();
|
BookTest::from_dir("rendering/code_blocks_fenced_with_indent").check_all_main_files();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Unclosed HTML tags.
|
||||||
|
//
|
||||||
|
// Note that the HTML parsing algorithm is much more complicated than what
|
||||||
|
// this is checking.
|
||||||
|
#[test]
|
||||||
|
fn unclosed_html_tags() {
|
||||||
|
BookTest::init(|_| {})
|
||||||
|
.change_file("src/chapter_1.md", "<div>x<span>foo<i>xyz")
|
||||||
|
.run("build", |cmd| {
|
||||||
|
cmd.expect_stderr(str![[r#"
|
||||||
|
INFO Book building has started
|
||||||
|
INFO Running the html backend
|
||||||
|
WARN unclosed HTML tag `<i>` found in `chapter_1.md`
|
||||||
|
WARN unclosed HTML tag `<span>` found in `chapter_1.md`
|
||||||
|
WARN unclosed HTML tag `<div>` found in `chapter_1.md`
|
||||||
|
INFO HTML book written to `[ROOT]/book`
|
||||||
|
|
||||||
|
"#]]);
|
||||||
|
})
|
||||||
|
.check_main_file(
|
||||||
|
"book/chapter_1.html",
|
||||||
|
str!["<div>x<span>foo<i>xyz</i></span></div>"],
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test for HTML tags out of sync.
|
||||||
|
#[test]
|
||||||
|
fn unbalanced_html_tags() {
|
||||||
|
BookTest::init(|_| {})
|
||||||
|
.change_file("src/chapter_1.md", "<div>x<span>foo</div></span>")
|
||||||
|
.run("build", |cmd| {
|
||||||
|
cmd.expect_stderr(str![[r#"
|
||||||
|
INFO Book building has started
|
||||||
|
INFO Running the html backend
|
||||||
|
WARN unexpected HTML end tag `</div>` found in `chapter_1.md`
|
||||||
|
Check that the HTML tags are properly balanced.
|
||||||
|
WARN unclosed HTML tag `<div>` found in `chapter_1.md`
|
||||||
|
INFO HTML book written to `[ROOT]/book`
|
||||||
|
|
||||||
|
"#]]);
|
||||||
|
})
|
||||||
|
.check_main_file("book/chapter_1.html", str!["<div>x<span>foo</span></div>"]);
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue