Migrate book_creates_reasonable_search_index to BookTest

This commit is contained in:
Eric Huss 2025-04-22 09:00:47 -07:00
parent 3fce1151dd
commit cad8988f8d
12 changed files with 167 additions and 61 deletions

View file

@ -393,67 +393,6 @@ mod search {
serde_json::from_str(&index.replace("\\'", "'").replace("\\\\", "\\")).unwrap()
}
/// Spot-checks the search index produced by building the dummy book.
///
/// The book is built into a temporary directory, the generated index is loaded
/// with `read_book_index`, and a handful of entries (term statistics, bodies
/// and breadcrumbs) are compared against their expected values.
#[test]
fn book_creates_reasonable_search_index() {
    let temp = DummyBook::new().build().unwrap();
    let md = MDBook::load(temp.path()).unwrap();
    md.build().unwrap();

    let index = read_book_index(temp.path());

    let doc_urls = index["doc_urls"].as_array().unwrap();
    eprintln!("doc_urls={doc_urls:#?}",);
    // A document reference is the stringified position of its URL within
    // `doc_urls`. Name the URL on failure so a missing entry is easy to spot.
    let get_doc_ref = |url: &str| -> String {
        doc_urls
            .iter()
            .position(|s| s == url)
            .unwrap_or_else(|| panic!("failed to find {url}"))
            .to_string()
    };

    let first_chapter = get_doc_ref("first/index.html#first-chapter");
    let introduction = get_doc_ref("intro.html#introduction");
    let some_section = get_doc_ref("first/index.html#some-section");
    let summary = get_doc_ref("first/includes.html#summary");
    let no_headers = get_doc_ref("first/no-headers.html");
    let duplicate_headers_1 = get_doc_ref("first/duplicate-headers.html#header-text-1");
    let conclusion = get_doc_ref("conclusion.html#conclusion");
    let heading_attrs = get_doc_ref("first/heading-attributes.html#both");

    // Walk the body index one key at a time: `t`, `e`, `x`, `t`.
    let bodyidx = &index["index"]["index"]["body"]["root"];
    let textidx = &bodyidx["t"]["e"]["x"]["t"];
    assert_eq!(textidx["df"], 5);
    assert_eq!(textidx["docs"][&first_chapter]["tf"], 1.0);
    assert_eq!(textidx["docs"][&introduction]["tf"], 1.0);

    let docs = &index["index"]["documentStore"]["docs"];
    assert_eq!(docs[&first_chapter]["body"], "more text.");
    assert_eq!(docs[&some_section]["body"], "");
    assert_eq!(
        docs[&summary]["body"],
        "Dummy Book Introduction First Chapter Nested Chapter Includes Recursive Markdown Unicode No Headers Duplicate Headers Heading Attributes Second Chapter Nested Chapter Conclusion"
    );
    assert_eq!(
        docs[&summary]["breadcrumbs"],
        "First Chapter » Includes » Summary"
    );
    // See note about InlineHtml in search.rs. Ideally the `alert()` part
    // should not be in the index, but we don't have a way to scrub inline
    // html.
    assert_eq!(docs[&conclusion]["body"], "I put <HTML> in here! Sneaky inline event alert(\"inline\");. But regular inline is indexed.");
    assert_eq!(
        docs[&no_headers]["breadcrumbs"],
        "First Chapter » No Headers"
    );
    assert_eq!(
        docs[&duplicate_headers_1]["breadcrumbs"],
        "First Chapter » Duplicate Headers » Header Text"
    );
    assert_eq!(
        docs[&no_headers]["body"],
        "Capybara capybara capybara. Capybara capybara capybara. ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex."
    );
    assert_eq!(
        docs[&heading_attrs]["breadcrumbs"],
        "First Chapter » Heading Attributes » Heading with id and classes"
    );
}
#[test]
fn can_disable_individual_chapters() {
let temp = DummyBook::new().build().unwrap();

View file

@ -15,6 +15,8 @@ mod print;
mod redirects;
mod renderer;
mod rendering;
#[cfg(feature = "search")]
mod search;
mod prelude {
pub use crate::book_test::BookTest;

78
tests/testsuite/search.rs Normal file
View file

@ -0,0 +1,78 @@
//! Tests for search support.
use crate::prelude::*;
use std::path::Path;
/// Loads `book/searchindex.js` beneath `root` and parses the JSON payload it carries.
///
/// The leading `window.search = JSON.parse('` and trailing `');` are trimmed
/// away, then the `\'` and `\\` escapes are undone before the remainder is
/// handed to `serde_json`.
///
/// # Panics
///
/// Panics, naming the offending file, if it cannot be read or if the
/// unescaped payload is not valid JSON.
fn read_book_index(root: &Path) -> serde_json::Value {
    let path = root.join("book/searchindex.js");
    let contents = std::fs::read_to_string(&path)
        .unwrap_or_else(|e| panic!("failed to read {}: {e}", path.display()));
    let escaped = contents
        .trim_start_matches("window.search = JSON.parse('")
        .trim_end_matches("');");
    // We need to unescape the string as it's supposed to be an escaped JS string.
    let json = escaped.replace("\\'", "'").replace("\\\\", "\\");
    serde_json::from_str(&json)
        .unwrap_or_else(|e| panic!("failed to parse {}: {e}", path.display()))
}
// Spot-checks a few entries of the search index generated for the
// `search/reasonable_search_index` book.
#[test]
fn reasonable_search_index() {
    let mut book = BookTest::from_dir("search/reasonable_search_index");
    book.build();
    let search_index = read_book_index(&book.dir);

    let urls = search_index["doc_urls"].as_array().unwrap();
    eprintln!("doc_urls={:#?}", urls);

    // A document reference is the stringified position of its URL in `doc_urls`.
    let ref_of = |url: &str| -> String {
        match urls.iter().position(|candidate| candidate == url) {
            Some(pos) => pos.to_string(),
            None => panic!("failed to find {url}"),
        }
    };

    let first_chapter_ref = ref_of("first/index.html#first-chapter");
    let introduction_ref = ref_of("intro.html#introduction");
    let some_section_ref = ref_of("first/index.html#some-section");
    let summary_ref = ref_of("first/includes.html#summary");
    let no_headers_ref = ref_of("first/no-headers.html");
    let duplicate_headers_ref = ref_of("first/duplicate-headers.html#header-text-1");
    let heading_attrs_ref = ref_of("first/heading-attributes.html#both");
    let sneaky_ref = ref_of("intro.html#sneaky");

    // Walk the body index one key at a time: `t`, `e`, `x`, `t`.
    let text_entry = &search_index["index"]["index"]["body"]["root"]["t"]["e"]["x"]["t"];
    assert_eq!(text_entry["df"], 5);
    assert_eq!(text_entry["docs"][&first_chapter_ref]["tf"], 1.0);
    assert_eq!(text_entry["docs"][&introduction_ref]["tf"], 1.0);

    let store = &search_index["index"]["documentStore"]["docs"];
    assert_eq!(store[&first_chapter_ref]["body"], "more text.");
    assert_eq!(store[&some_section_ref]["body"], "");
    assert_eq!(
        store[&summary_ref]["body"],
        "Introduction First Chapter Includes Unicode No Headers Duplicate Headers Heading Attributes"
    );
    assert_eq!(
        store[&summary_ref]["breadcrumbs"],
        "First Chapter » Includes » Summary"
    );

    // The `alert()` text ideally would not be indexed, but there is no way to
    // scrub inline HTML; see the note about InlineHtml in search.rs.
    assert_eq!(
        store[&sneaky_ref]["body"],
        "I put <HTML> in here! Sneaky inline event alert(\"inline\");. But regular inline is indexed."
    );
    assert_eq!(
        store[&no_headers_ref]["breadcrumbs"],
        "First Chapter » No Headers"
    );
    assert_eq!(
        store[&duplicate_headers_ref]["breadcrumbs"],
        "First Chapter » Duplicate Headers » Header Text"
    );
    assert_eq!(
        store[&no_headers_ref]["body"],
        "Capybara capybara capybara. Capybara capybara capybara. ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex."
    );
    assert_eq!(
        store[&heading_attrs_ref]["breadcrumbs"],
        "First Chapter » Heading Attributes » Heading with id and classes"
    );
}

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,10 @@
# Summary
[Introduction](intro.md)
- [First Chapter](first/index.md)
- [Includes](first/includes.md)
- [Unicode](first/unicode.md)
- [No Headers](first/no-headers.md)
- [Duplicate Headers](first/duplicate-headers.md)
- [Heading Attributes](first/heading-attributes.md)

View file

@ -0,0 +1,9 @@
# Duplicate headers
This page validates behaviour of duplicate headers.
# Header Text
# Header Text
# header-text

View file

@ -0,0 +1,5 @@
# Heading Attributes {#attrs}
## Heading with classes {.class1 .class2}
## Heading with id and classes {#both .class1 .class2}

View file

@ -0,0 +1,3 @@
# Includes
{{#include ../SUMMARY.md::}}

View file

@ -0,0 +1,5 @@
# First Chapter
more text.
## Some Section

View file

@ -0,0 +1,5 @@
Capybara capybara capybara.
Capybara capybara capybara.
ThisLongWordIsIncludedSoWeCanCheckThatSufficientlyLongWordsAreOmittedFromTheSearchIndex.

View file

@ -0,0 +1,21 @@
# Unicode stress tests
Please be careful editing, this contains carefully crafted characters.
Two byte character: spatiëring
Combining character: spatiëring
Three byte character: 书こんにちは
Four byte character: 𐌀‮𐌁‮𐌂‮𐌃‮𐌄‮𐌅‮𐌆‮𐌇‮𐌈‬
Right-to-left: مرحبا
Emoticons: 🔊 😍 💜 1
right-to-left mark: hello באמת!
Zalgo: ǫ̛̖̱̗̝͈̋͒͋̏ͥͫ̒̆ͩ̏͌̾͊͐ͪ̾̚

View file

@ -0,0 +1,28 @@
# Introduction
Here's some interesting text...
## Sneaky
<p>
<!--secret secret-->
I put &lt;HTML&gt; in here!<br/>
</p>
<script type="text/javascript" >
// I probably shouldn't do this
if (3 < 5 > 10)
{
alert("The sky is falling!");
}
</script >
<style >
/*
css looks, like this {
foo: < 3 <bar >
}
*/
</style>
Sneaky inline event <script>alert("inline");</script>.
But regular <b>inline</b> is indexed.