//! Markdown processing used in mdBook. //! //! This crate provides functions for processing Markdown in the same way as //! [mdBook](https://rust-lang.github.io/mdBook/). The [`pulldown_cmark`] //! crate is used as the underlying parser. This crate re-exports //! [`pulldown_cmark`] so that you can access its types. //! //! The parser in this library adds several modifications to the //! [`pulldown_cmark`] event stream. For example, it adjusts some links, //! modifies the behavior of footnotes, and adds various HTML wrappers. use pulldown_cmark::{CodeBlockKind, CowStr, Event, Options, Parser, Tag, TagEnd, html}; use regex::Regex; use std::collections::HashMap; use std::fmt::Write; use std::path::Path; use std::sync::LazyLock; use tracing::warn; #[doc(inline)] pub use pulldown_cmark; #[cfg(test)] mod tests; /// Options for parsing markdown. #[non_exhaustive] pub struct MarkdownOptions { /// Enables smart punctuation. /// /// Converts quotes to curly quotes, `...` to `…`, `--` to en-dash, and /// `---` to em-dash. /// /// This is `true` by default. pub smart_punctuation: bool, } impl Default for MarkdownOptions { fn default() -> MarkdownOptions { MarkdownOptions { smart_punctuation: true, } } } /// Options for converting markdown to HTML. #[non_exhaustive] pub struct HtmlRenderOptions<'a> { /// Options for parsing markdown. pub markdown_options: MarkdownOptions, /// The chapter's location, relative to the `SUMMARY.md` file. pub path: &'a Path, /// If true, render for the print page. pub for_print: bool, } impl<'a> HtmlRenderOptions<'a> { /// Creates a new [`HtmlRenderOptions`]. pub fn new(path: &'a Path) -> HtmlRenderOptions<'a> { HtmlRenderOptions { markdown_options: MarkdownOptions::default(), path, for_print: false, } } } /// Creates a new pulldown-cmark parser of the given text. pub fn new_cmark_parser<'text>(text: &'text str, options: &MarkdownOptions) -> Parser<'text> { let mut opts = Options::empty(); opts.insert(Options::ENABLE_TABLES); opts.insert(Options::ENABLE_FOOTNOTES); opts.insert(Options::ENABLE_STRIKETHROUGH); opts.insert(Options::ENABLE_TASKLISTS); opts.insert(Options::ENABLE_HEADING_ATTRIBUTES); if options.smart_punctuation { opts.insert(Options::ENABLE_SMART_PUNCTUATION); } Parser::new_ext(text, opts) } /// Renders markdown to HTML. pub fn render_markdown(text: &str, options: &HtmlRenderOptions<'_>) -> String { let mut body = String::with_capacity(text.len() * 3 / 2); // Based on // https://github.com/pulldown-cmark/pulldown-cmark/blob/master/pulldown-cmark/examples/footnote-rewrite.rs // This handling of footnotes is a two-pass process. This is done to // support linkbacks, little arrows that allow you to jump back to the // footnote reference. The first pass collects the footnote definitions. // The second pass modifies those definitions to include the linkbacks, // and inserts the definitions back into the `events` list. // This is a map of name -> (number, count) // `name` is the name of the footnote. // `number` is the footnote number displayed in the output. // `count` is the number of references to this footnote (used for multiple // linkbacks, and checking for unused footnotes). let mut footnote_numbers = HashMap::new(); // This is a map of name -> Vec // `name` is the name of the footnote. // The events list is the list of events needed to build the footnote definition. let mut footnote_defs = HashMap::new(); // The following are used when currently processing a footnote definition. // // This is the name of the footnote (escaped). let mut in_footnote_name = String::new(); // This is the list of events to build the footnote definition. let mut in_footnote = Vec::new(); // This is used to add space between consecutive footnotes. I was unable // to figure out a way to do this just with pure CSS. let mut prev_was_footnote = false; let events = new_cmark_parser(text, &options.markdown_options) .map(clean_codeblock_headers) .map(|event| adjust_links(event, options)) .flat_map(|event| { let (a, b) = wrap_tables(event); a.into_iter().chain(b) }) // Footnote rewriting must go last to ensure inner definition contents // are processed (since they get pulled out of the initial stream). .filter_map(|event| { match event { Event::Start(Tag::FootnoteDefinition(name)) => { prev_was_footnote = false; if !in_footnote.is_empty() { warn!( "internal bug: nested footnote not expected in {:?}", options.path ); } in_footnote_name = special_escape(&name); None } Event::End(TagEnd::FootnoteDefinition) => { let def_events = std::mem::take(&mut in_footnote); let name = std::mem::take(&mut in_footnote_name); if footnote_defs.contains_key(&name) { warn!( "footnote `{name}` in {} defined multiple times - \ not updating to new definition", options.path.display() ); } else { footnote_defs.insert(name, def_events); } None } Event::FootnoteReference(name) => { let name = special_escape(&name); let len = footnote_numbers.len() + 1; let (n, count) = footnote_numbers.entry(name.clone()).or_insert((len, 0)); *count += 1; let mut html = String::new(); if prev_was_footnote { write!(html, " ").unwrap(); } prev_was_footnote = true; write!( html, "\ {n}\ " ) .unwrap(); let html = Event::Html(html.into()); if in_footnote_name.is_empty() { Some(html) } else { // While inside a footnote, we need to accumulate. in_footnote.push(html); None } } // While inside a footnote, accumulate all events into a local. _ if !in_footnote_name.is_empty() => { in_footnote.push(event); prev_was_footnote = false; None } _ => { prev_was_footnote = false; Some(event) } } }); html::push_html(&mut body, events); if !footnote_defs.is_empty() { add_footnote_defs( &mut body, options, footnote_defs.into_iter().collect(), &footnote_numbers, ); } body } /// Adds all footnote definitions into `body`. fn add_footnote_defs( body: &mut String, options: &HtmlRenderOptions<'_>, mut defs: Vec<(String, Vec>)>, numbers: &HashMap, ) { // Remove unused. defs.retain(|(name, _)| { if !numbers.contains_key(name) { warn!( "footnote `{name}` in `{}` is defined but not referenced", options.path.display() ); false } else { true } }); defs.sort_by_cached_key(|(name, _)| numbers[name].0); body.push_str( "
\n\
    ", ); // Insert the backrefs to the definition, and put the definitions in the output. for (name, mut fn_events) in defs { let count = numbers[&name].1; fn_events.insert( 0, Event::Html(format!("
  1. ").into()), ); // Generate the linkbacks. for usage in 1..=count { let nth = if usage == 1 { String::new() } else { usage.to_string() }; let backlink = Event::Html(format!(" ↩{nth}").into()); if matches!(fn_events.last(), Some(Event::End(TagEnd::Paragraph))) { // Put the linkback at the end of the last paragraph instead // of on a line by itself. fn_events.insert(fn_events.len() - 1, backlink); } else { // Not a clear place to put it in this circumstance, so put it // at the end. fn_events.push(backlink); } } fn_events.push(Event::Html("
  2. \n".into())); html::push_html(body, fn_events.into_iter()); } body.push_str("
"); } /// Wraps tables in a `.table-wrapper` class to apply overflow-x rules to. fn wrap_tables(event: Event<'_>) -> (Option>, Option>) { match event { Event::Start(Tag::Table(_)) => ( Some(Event::Html(r#"
"#.into())), Some(event), ), Event::End(TagEnd::Table) => (Some(event), Some(Event::Html(r#"
"#.into()))), _ => (Some(event), None), } } fn clean_codeblock_headers(event: Event<'_>) -> Event<'_> { match event { Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(ref info))) => { let info: String = info .chars() .map(|x| match x { ' ' | '\t' => ',', _ => x, }) .filter(|ch| !ch.is_whitespace()) .collect(); Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(CowStr::from(info)))) } _ => event, } } /// Fix links to the correct location. /// /// This adjusts links, such as turning `.md` extensions to `.html`. /// /// `path` is the path to the page being rendered relative to the root of the /// book. This is used for the `print.html` page so that links on the print /// page go to the original location. Normal page rendering sets `path` to /// None. Ideally, print page links would link to anchors on the print page, /// but that is very difficult. fn adjust_links<'a>(event: Event<'a>, options: &HtmlRenderOptions<'_>) -> Event<'a> { static SCHEME_LINK: LazyLock = LazyLock::new(|| Regex::new(r"^[a-z][a-z0-9+.-]*:").unwrap()); static MD_LINK: LazyLock = LazyLock::new(|| Regex::new(r"(?P.*)\.md(?P#.*)?").unwrap()); fn fix<'a>(dest: CowStr<'a>, options: &HtmlRenderOptions<'_>) -> CowStr<'a> { if dest.starts_with('#') { // Fragment-only link. if options.for_print { let mut base = options.path.display().to_string(); if base.ends_with(".md") { base.replace_range(base.len() - 3.., ".html"); } return format!("{base}{dest}").into(); } else { return dest; } } // Don't modify links with schemes like `https`. if !SCHEME_LINK.is_match(&dest) { // This is a relative link, adjust it as necessary. let mut fixed_link = String::new(); if options.for_print { let base = options .path .parent() .expect("path can't be empty") .to_str() .expect("utf-8 paths only"); if !base.is_empty() { write!(fixed_link, "{base}/").unwrap(); } } if let Some(caps) = MD_LINK.captures(&dest) { fixed_link.push_str(&caps["link"]); fixed_link.push_str(".html"); if let Some(anchor) = caps.name("anchor") { fixed_link.push_str(anchor.as_str()); } } else { fixed_link.push_str(&dest); }; return CowStr::from(fixed_link); } dest } fn fix_html<'a>(html: CowStr<'a>, options: &HtmlRenderOptions<'_>) -> CowStr<'a> { // This is a terrible hack, but should be reasonably reliable. Nobody // should ever parse a tag with a regex. However, there isn't anything // in Rust that I know of that is suitable for handling partial html // fragments like those generated by pulldown_cmark. // // There are dozens of HTML tags/attributes that contain paths, so // feel free to add more tags if desired; these are the only ones I // care about right now. static HTML_LINK: LazyLock = LazyLock::new(|| Regex::new(r#"(<(?:a|img) [^>]*?(?:src|href)=")([^"]+?)""#).unwrap()); HTML_LINK .replace_all(&html, |caps: ®ex::Captures<'_>| { let fixed = fix(caps[2].into(), options); format!("{}{}\"", &caps[1], fixed) }) .into_owned() .into() } match event { Event::Start(Tag::Link { link_type, dest_url, title, id, }) => Event::Start(Tag::Link { link_type, dest_url: fix(dest_url, options), title, id, }), Event::Start(Tag::Image { link_type, dest_url, title, id, }) => Event::Start(Tag::Image { link_type, dest_url: fix(dest_url, options), title, id, }), Event::Html(html) => Event::Html(fix_html(html, options)), Event::InlineHtml(html) => Event::InlineHtml(fix_html(html, options)), _ => event, } } /// Escape characters to make it safe for an HTML string. pub fn special_escape(mut s: &str) -> String { let mut escaped = String::with_capacity(s.len()); let needs_escape: &[char] = &['<', '>', '\'', '"', '\\', '&']; while let Some(next) = s.find(needs_escape) { escaped.push_str(&s[..next]); match s.as_bytes()[next] { b'<' => escaped.push_str("<"), b'>' => escaped.push_str(">"), b'\'' => escaped.push_str("'"), b'"' => escaped.push_str("""), b'\\' => escaped.push_str("\"), b'&' => escaped.push_str("&"), _ => unreachable!(), } s = &s[next + 1..]; } escaped.push_str(s); escaped }