Merge pull request #2924 from ehuss/html-end-tags
Add better handling for unbalanced HTML tags
This commit is contained in:
commit
7d1566860c
2 changed files with 132 additions and 44 deletions
|
|
@ -19,7 +19,7 @@ use pulldown_cmark::{Alignment, CodeBlockKind, CowStr, Event, LinkType, Tag, Tag
|
|||
use std::borrow::Cow;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::ops::Deref;
|
||||
use tracing::{error, trace, warn};
|
||||
use tracing::{trace, warn};
|
||||
|
||||
/// Helper to create a [`QualName`].
|
||||
macro_rules! attr_qual_name {
|
||||
|
|
@ -307,6 +307,8 @@ where
|
|||
match event {
|
||||
Event::Start(tag) => self.start_tag(tag),
|
||||
Event::End(tag) => {
|
||||
// TODO: This should validate that the event stack is
|
||||
// properly synchronized with the tag stack.
|
||||
self.pop();
|
||||
match tag {
|
||||
TagEnd::TableHead => {
|
||||
|
|
@ -378,6 +380,7 @@ where
|
|||
}
|
||||
}
|
||||
}
|
||||
self.finish_stack();
|
||||
self.collect_footnote_defs();
|
||||
}
|
||||
|
||||
|
|
@ -606,40 +609,10 @@ where
|
|||
trace!("html token={token:?}");
|
||||
match token {
|
||||
Token::DoctypeToken(_) => {}
|
||||
Token::TagToken(tag) => {
|
||||
match tag.kind {
|
||||
TagKind::StartTag => {
|
||||
let is_closed = is_void_element(&tag.name) || tag.self_closing;
|
||||
is_raw = matches!(&*tag.name, "script" | "style");
|
||||
let name = QualName::new(None, html5ever::ns!(html), tag.name);
|
||||
let attrs = tag
|
||||
.attrs
|
||||
.into_iter()
|
||||
.map(|attr| (attr.name, attr.value))
|
||||
.collect();
|
||||
let mut el = Element {
|
||||
name,
|
||||
attrs,
|
||||
self_closing: tag.self_closing,
|
||||
was_raw: true,
|
||||
};
|
||||
fix_html_link(&mut el);
|
||||
self.push(Node::Element(el));
|
||||
if is_closed {
|
||||
// No end element.
|
||||
self.pop();
|
||||
}
|
||||
}
|
||||
TagKind::EndTag => {
|
||||
is_raw = false;
|
||||
if self.is_html_tag_matching(&tag.name) {
|
||||
self.pop();
|
||||
}
|
||||
// else the stack is corrupt. I'm not really sure
|
||||
// what to do here...
|
||||
}
|
||||
}
|
||||
}
|
||||
Token::TagToken(tag) => match tag.kind {
|
||||
TagKind::StartTag => self.start_html_tag(tag, &mut is_raw),
|
||||
TagKind::EndTag => self.end_html_tag(tag, &mut is_raw),
|
||||
},
|
||||
Token::CommentToken(comment) => {
|
||||
self.append(Node::Comment(comment));
|
||||
}
|
||||
|
|
@ -664,23 +637,60 @@ where
|
|||
}
|
||||
}
|
||||
|
||||
/// Adds an open HTML tag.
|
||||
fn start_html_tag(&mut self, tag: html5ever::tokenizer::Tag, is_raw: &mut bool) {
|
||||
let is_closed = is_void_element(&tag.name) || tag.self_closing;
|
||||
*is_raw = matches!(&*tag.name, "script" | "style");
|
||||
let name = QualName::new(None, html5ever::ns!(html), tag.name);
|
||||
let attrs = tag
|
||||
.attrs
|
||||
.into_iter()
|
||||
.map(|attr| (attr.name, attr.value))
|
||||
.collect();
|
||||
let mut el = Element {
|
||||
name,
|
||||
attrs,
|
||||
self_closing: tag.self_closing,
|
||||
was_raw: true,
|
||||
};
|
||||
fix_html_link(&mut el);
|
||||
self.push(Node::Element(el));
|
||||
if is_closed {
|
||||
// No end element.
|
||||
self.pop();
|
||||
}
|
||||
}
|
||||
|
||||
/// Closes the given HTML tag.
|
||||
fn end_html_tag(&mut self, tag: html5ever::tokenizer::Tag, is_raw: &mut bool) {
|
||||
*is_raw = false;
|
||||
if self.is_html_tag_matching(&tag.name) {
|
||||
self.pop();
|
||||
} else {
|
||||
// The proper thing to do here is to recover. However, the HTML
|
||||
// parsing algorithm for that is quite complex. See
|
||||
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
|
||||
// and the adoption agency algorithm.
|
||||
warn!(
|
||||
"unexpected HTML end tag `</{}>` found in `{}`\n\
|
||||
Check that the HTML tags are properly balanced.",
|
||||
tag.name,
|
||||
self.options.path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// This is used to verify HTML parsing keeps the stack of tags in sync.
///
/// Looks up the node at `self.current_node` and compares it against `name`:
/// the check passes only when that node is a `Node::Element` whose `name()`
/// equals `name`.
///
/// NOTE(review): this span comes from a diff view and appears to interleave
/// the removed lines (`return true;` plus the `error!` report) with the added
/// `true` / `else` / `false` branches — confirm which side is current before
/// relying on it.
|
||||
fn is_html_tag_matching(&self, name: &str) -> bool {
|
||||
let current = self.tree.get(self.current_node).unwrap().value();
|
||||
if let Node::Element(el) = current
|
||||
&& el.name() == name
|
||||
{
|
||||
return true;
|
||||
}
|
||||
error!(
|
||||
"internal error: HTML tag stack out of sync.\n
|
||||
path: `{}`\n\
|
||||
current={current:?}\n\
|
||||
pop name: {name}",
|
||||
self.options.path.display()
|
||||
);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Eats all pulldown-cmark events until the next `End` matching the
|
||||
/// current nesting level.
|
||||
|
|
@ -736,6 +746,40 @@ where
|
|||
output
|
||||
}
|
||||
|
||||
/// Deals with any unclosed elements on the stack.
|
||||
fn finish_stack(&mut self) {
|
||||
while let Some(node_id) = self.tag_stack.pop() {
|
||||
let node = self.tree.get(node_id).unwrap().value();
|
||||
match node {
|
||||
Node::Fragment => {}
|
||||
Node::Element(el) => {
|
||||
if el.was_raw {
|
||||
warn!(
|
||||
"unclosed HTML tag `<{}>` found in `{}`",
|
||||
el.name.local,
|
||||
self.options.path.display()
|
||||
);
|
||||
} else {
|
||||
panic!(
|
||||
"internal error: expected empty tag stack.\n
|
||||
path: `{}`\n\
|
||||
element={el:?}",
|
||||
self.options.path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
node => {
|
||||
panic!(
|
||||
"internal error: expected empty tag stack.\n
|
||||
path: `{}`\n\
|
||||
node={node:?}",
|
||||
self.options.path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Appends a new footnote reference.
|
||||
fn footnote_reference(&mut self, name: CowStr<'event>) {
|
||||
let len = self.footnote_numbers.len() + 1;
|
||||
|
|
|
|||
|
|
@ -239,3 +239,47 @@ fn html_blocks() {
|
|||
fn code_block_fenced_with_indent() {
|
||||
BookTest::from_dir("rendering/code_blocks_fenced_with_indent").check_all_main_files();
|
||||
}
|
||||
|
||||
// Unclosed HTML tags.
|
||||
//
|
||||
// Note that the HTML parsing algorithm is much more complicated than what
|
||||
// this is checking.
//
// The chapter source leaves `<div>`, `<span>` and `<i>` open. The build is
// expected to warn once per unclosed tag, innermost first, and the rendered
// chapter is expected to contain the matching end tags.
|
||||
#[test]
|
||||
fn unclosed_html_tags() {
|
||||
BookTest::init(|_| {})
|
||||
.change_file("src/chapter_1.md", "<div>x<span>foo<i>xyz")
|
||||
.run("build", |cmd| {
|
||||
cmd.expect_stderr(str![[r#"
|
||||
INFO Book building has started
|
||||
INFO Running the html backend
|
||||
WARN unclosed HTML tag `<i>` found in `chapter_1.md`
|
||||
WARN unclosed HTML tag `<span>` found in `chapter_1.md`
|
||||
WARN unclosed HTML tag `<div>` found in `chapter_1.md`
|
||||
INFO HTML book written to `[ROOT]/book`
|
||||
|
||||
"#]]);
|
||||
})
|
||||
.check_main_file(
|
||||
"book/chapter_1.html",
|
||||
str!["<div>x<span>foo<i>xyz</i></span></div>"],
|
||||
);
|
||||
}
|
||||
|
||||
// Test for HTML tags out of sync.
//
// The chapter source closes `</div>` while `<span>` is still open. The build
// is expected to warn about the unexpected `</div>` and then about `<div>`
// being left unclosed, and the rendered chapter is expected to be
// `<div>x<span>foo</span></div>`.
|
||||
#[test]
|
||||
fn unbalanced_html_tags() {
|
||||
BookTest::init(|_| {})
|
||||
.change_file("src/chapter_1.md", "<div>x<span>foo</div></span>")
|
||||
.run("build", |cmd| {
|
||||
cmd.expect_stderr(str![[r#"
|
||||
INFO Book building has started
|
||||
INFO Running the html backend
|
||||
WARN unexpected HTML end tag `</div>` found in `chapter_1.md`
|
||||
Check that the HTML tags are properly balanced.
|
||||
WARN unclosed HTML tag `<div>` found in `chapter_1.md`
|
||||
INFO HTML book written to `[ROOT]/book`
|
||||
|
||||
"#]]);
|
||||
})
|
||||
.check_main_file("book/chapter_1.html", str!["<div>x<span>foo</span></div>"]);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue