Merge pull request #2924 from ehuss/html-end-tags

Add better handling for unbalanced HTML tags
This commit is contained in:
Eric Huss 2025-11-06 03:02:40 +00:00 committed by GitHub
commit 7d1566860c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 132 additions and 44 deletions

View file

@ -19,7 +19,7 @@ use pulldown_cmark::{Alignment, CodeBlockKind, CowStr, Event, LinkType, Tag, Tag
use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::ops::Deref;
use tracing::{error, trace, warn};
use tracing::{trace, warn};
/// Helper to create a [`QualName`].
macro_rules! attr_qual_name {
@ -307,6 +307,8 @@ where
match event {
Event::Start(tag) => self.start_tag(tag),
Event::End(tag) => {
// TODO: This should validate that the event stack is
// properly synchronized with the tag stack.
self.pop();
match tag {
TagEnd::TableHead => {
@ -378,6 +380,7 @@ where
}
}
}
self.finish_stack();
self.collect_footnote_defs();
}
@ -606,40 +609,10 @@ where
trace!("html token={token:?}");
match token {
Token::DoctypeToken(_) => {}
Token::TagToken(tag) => {
match tag.kind {
TagKind::StartTag => {
let is_closed = is_void_element(&tag.name) || tag.self_closing;
is_raw = matches!(&*tag.name, "script" | "style");
let name = QualName::new(None, html5ever::ns!(html), tag.name);
let attrs = tag
.attrs
.into_iter()
.map(|attr| (attr.name, attr.value))
.collect();
let mut el = Element {
name,
attrs,
self_closing: tag.self_closing,
was_raw: true,
};
fix_html_link(&mut el);
self.push(Node::Element(el));
if is_closed {
// No end element.
self.pop();
}
}
TagKind::EndTag => {
is_raw = false;
if self.is_html_tag_matching(&tag.name) {
self.pop();
}
// else the stack is corrupt. I'm not really sure
// what to do here...
}
}
}
Token::TagToken(tag) => match tag.kind {
TagKind::StartTag => self.start_html_tag(tag, &mut is_raw),
TagKind::EndTag => self.end_html_tag(tag, &mut is_raw),
},
Token::CommentToken(comment) => {
self.append(Node::Comment(comment));
}
@ -664,22 +637,59 @@ where
}
}
/// Adds an open HTML tag.
fn start_html_tag(&mut self, tag: html5ever::tokenizer::Tag, is_raw: &mut bool) {
let is_closed = is_void_element(&tag.name) || tag.self_closing;
*is_raw = matches!(&*tag.name, "script" | "style");
let name = QualName::new(None, html5ever::ns!(html), tag.name);
let attrs = tag
.attrs
.into_iter()
.map(|attr| (attr.name, attr.value))
.collect();
let mut el = Element {
name,
attrs,
self_closing: tag.self_closing,
was_raw: true,
};
fix_html_link(&mut el);
self.push(Node::Element(el));
if is_closed {
// No end element.
self.pop();
}
}
/// Closes the given HTML tag.
fn end_html_tag(&mut self, tag: html5ever::tokenizer::Tag, is_raw: &mut bool) {
*is_raw = false;
if self.is_html_tag_matching(&tag.name) {
self.pop();
} else {
// The proper thing to do here is to recover. However, the HTML
// parsing algorithm for that is quite complex. See
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
// and the adoption agency algorithm.
warn!(
"unexpected HTML end tag `</{}>` found in `{}`\n\
Check that the HTML tags are properly balanced.",
tag.name,
self.options.path.display()
);
}
}
/// This is used to verify HTML parsing keeps the stack of tags in sync.
fn is_html_tag_matching(&self, name: &str) -> bool {
let current = self.tree.get(self.current_node).unwrap().value();
if let Node::Element(el) = current
&& el.name() == name
{
return true;
true
} else {
false
}
error!(
"internal error: HTML tag stack out of sync.\n
path: `{}`\n\
current={current:?}\n\
pop name: {name}",
self.options.path.display()
);
false
}
/// Eats all pulldown-cmark events until the next `End` matching the
@ -736,6 +746,40 @@ where
output
}
/// Deals with any unclosed elements on the stack.
fn finish_stack(&mut self) {
while let Some(node_id) = self.tag_stack.pop() {
let node = self.tree.get(node_id).unwrap().value();
match node {
Node::Fragment => {}
Node::Element(el) => {
if el.was_raw {
warn!(
"unclosed HTML tag `<{}>` found in `{}`",
el.name.local,
self.options.path.display()
);
} else {
panic!(
"internal error: expected empty tag stack.\n
path: `{}`\n\
element={el:?}",
self.options.path.display()
);
}
}
node => {
panic!(
"internal error: expected empty tag stack.\n
path: `{}`\n\
node={node:?}",
self.options.path.display()
);
}
}
}
}
/// Appends a new footnote reference.
fn footnote_reference(&mut self, name: CowStr<'event>) {
let len = self.footnote_numbers.len() + 1;

View file

@ -239,3 +239,47 @@ fn html_blocks() {
fn code_block_fenced_with_indent() {
BookTest::from_dir("rendering/code_blocks_fenced_with_indent").check_all_main_files();
}
// Unclosed HTML tags.
//
// Note that the HTML parsing algorithm is much more complicated than what
// this is checking.
#[test]
fn unclosed_html_tags() {
BookTest::init(|_| {})
.change_file("src/chapter_1.md", "<div>x<span>foo<i>xyz")
.run("build", |cmd| {
cmd.expect_stderr(str![[r#"
INFO Book building has started
INFO Running the html backend
WARN unclosed HTML tag `<i>` found in `chapter_1.md`
WARN unclosed HTML tag `<span>` found in `chapter_1.md`
WARN unclosed HTML tag `<div>` found in `chapter_1.md`
INFO HTML book written to `[ROOT]/book`
"#]]);
})
.check_main_file(
"book/chapter_1.html",
str!["<div>x<span>foo<i>xyz</i></span></div>"],
);
}
// Test for HTML tags out of sync.
#[test]
fn unbalanced_html_tags() {
BookTest::init(|_| {})
.change_file("src/chapter_1.md", "<div>x<span>foo</div></span>")
.run("build", |cmd| {
cmd.expect_stderr(str![[r#"
INFO Book building has started
INFO Running the html backend
WARN unexpected HTML end tag `</div>` found in `chapter_1.md`
Check that the HTML tags are properly balanced.
WARN unclosed HTML tag `<div>` found in `chapter_1.md`
INFO HTML book written to `[ROOT]/book`
"#]]);
})
.check_main_file("book/chapter_1.html", str!["<div>x<span>foo</span></div>"]);
}