use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::path::{Path, PathBuf}; use std::sync::LazyLock; use anyhow::{Context, Result, bail}; use elasticlunr::{Index, IndexBuilder}; use log::{debug, warn}; use pulldown_cmark::*; use serde::Serialize; use crate::book::{Book, BookItem, Chapter}; use crate::config::{Search, SearchChapterSettings}; use crate::renderer::html_handlebars::StaticFiles; use crate::theme::searcher; use crate::utils; const MAX_WORD_LENGTH_TO_INDEX: usize = 80; /// Tokenizes in the same way as elasticlunr-rs (for English), but also drops long tokens. fn tokenize(text: &str) -> Vec { text.split(|c: char| c.is_whitespace() || c == '-') .filter(|s| !s.is_empty()) .map(|s| s.trim().to_lowercase()) .filter(|s| s.len() <= MAX_WORD_LENGTH_TO_INDEX) .collect() } /// Creates all files required for search. pub fn create_files( search_config: &Search, static_files: &mut StaticFiles, book: &Book, ) -> Result<()> { let mut index = IndexBuilder::new() .add_field_with_tokenizer("title", Box::new(&tokenize)) .add_field_with_tokenizer("body", Box::new(&tokenize)) .add_field_with_tokenizer("breadcrumbs", Box::new(&tokenize)) .build(); let mut doc_urls = Vec::with_capacity(book.sections.len()); let chapter_configs = sort_search_config(&search_config.chapter); validate_chapter_config(&chapter_configs, book)?; for item in book.iter() { let chapter = match item { BookItem::Chapter(ch) if !ch.is_draft_chapter() => ch, _ => continue, }; if let Some(path) = settings_path(chapter) { let chapter_settings = get_chapter_settings(&chapter_configs, path); if !chapter_settings.enable.unwrap_or(true) { continue; } } render_item(&mut index, search_config, &mut doc_urls, chapter)?; } let index = write_to_json(index, search_config, doc_urls)?; debug!("Writing search index ✓"); if index.len() > 10_000_000 { warn!("search index is very large ({} bytes)", index.len()); } if search_config.copy_js { static_files.add_builtin( "searchindex.js", // To reduce the size of the generated JSON by preventing all `"` characters to be // escaped, we instead surround the string with much less common `'` character. format!( "window.search = Object.assign(window.search, JSON.parse('{}'));", index.replace("\\", "\\\\").replace("'", "\\'") ) .as_bytes(), ); static_files.add_builtin("searcher.js", searcher::JS); static_files.add_builtin("mark.min.js", searcher::MARK_JS); static_files.add_builtin("elasticlunr.min.js", searcher::ELASTICLUNR_JS); debug!("Copying search files ✓"); } Ok(()) } /// Uses the given arguments to construct a search document, then inserts it to the given index. fn add_doc( index: &mut Index, doc_urls: &mut Vec, anchor_base: &str, heading: &str, id_counter: &mut HashMap, section_id: &Option>, items: &[&str], ) { // Either use the explicit section id the user specified, or generate one // from the heading content. let section_id = section_id.as_ref().map(|id| id.to_string()).or_else(|| { if heading.is_empty() { // In the case where a chapter has no heading, don't set a section id. None } else { Some(utils::unique_id_from_content(heading, id_counter)) } }); let url = if let Some(id) = section_id { Cow::Owned(format!("{anchor_base}#{id}")) } else { Cow::Borrowed(anchor_base) }; let url = utils::collapse_whitespace(url.trim()); let doc_ref = doc_urls.len().to_string(); doc_urls.push(url.into()); let items = items.iter().map(|&x| utils::collapse_whitespace(x.trim())); index.add_doc(&doc_ref, items); } /// Renders markdown into flat unformatted text and adds it to the search index. fn render_item( index: &mut Index, search_config: &Search, doc_urls: &mut Vec, chapter: &Chapter, ) -> Result<()> { let chapter_path = chapter .path .as_ref() .expect("Checked that path exists above"); let filepath = Path::new(&chapter_path).with_extension("html"); let filepath = filepath .to_str() .with_context(|| "Could not convert HTML path to str")?; let anchor_base = utils::fs::normalize_path(filepath); let mut p = utils::new_cmark_parser(&chapter.content, false).peekable(); let mut in_heading = false; let max_section_depth = u32::from(search_config.heading_split_level); let mut section_id = None; let mut heading = String::new(); let mut body = String::new(); let mut breadcrumbs = chapter.parent_names.clone(); let mut footnote_numbers = HashMap::new(); breadcrumbs.push(chapter.name.clone()); let mut id_counter = HashMap::new(); while let Some(event) = p.next() { match event { Event::Start(Tag::Heading { level, id, .. }) if level as u32 <= max_section_depth => { if !heading.is_empty() { // Section finished, the next heading is following now // Write the data to the index, and clear it for the next section add_doc( index, doc_urls, &anchor_base, &heading, &mut id_counter, §ion_id, &[&heading, &body, &breadcrumbs.join(" » ")], ); heading.clear(); body.clear(); breadcrumbs.pop(); } section_id = id; in_heading = true; } Event::End(TagEnd::Heading(level)) if level as u32 <= max_section_depth => { in_heading = false; breadcrumbs.push(heading.clone()); } Event::Start(Tag::FootnoteDefinition(name)) => { let number = footnote_numbers.len() + 1; footnote_numbers.entry(name).or_insert(number); } Event::Html(html) => { let mut html_block = html.into_string(); // As of pulldown_cmark 0.6, html events are no longer contained // in an HtmlBlock tag. We must collect consecutive Html events // into a block ourselves. while let Some(Event::Html(html)) = p.peek() { html_block.push_str(html); p.next(); } body.push_str(&clean_html(&html_block)); } Event::InlineHtml(html) => { // This is not capable of cleaning inline tags like // `foo `. The `