From a043f641ea6591f9b2faace25d82ecb34159d9a6 Mon Sep 17 00:00:00 2001 From: Jonathan Strong Date: Wed, 11 Mar 2020 04:14:45 -0400 Subject: [PATCH] tantivy indexing, doesn't crash, but index produced is corrupt somehow --- Cargo.toml | 5 ++ components/search/Cargo.toml | 10 +++ components/search/src/lib.rs | 149 ++++++++++++++++++++++++++++++++++- components/site/src/lib.rs | 6 +- src/cli.rs | 50 +++++++----- src/cmd/index.rs | 30 ++++--- 6 files changed, 217 insertions(+), 33 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cd164f3..1839d67 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,12 +34,14 @@ ws = "0.9" ctrlc = "3" open = "1.2" globset = "0.4" +tantivy = { version = "0.12", optional = true } site = { path = "components/site" } errors = { path = "components/errors" } front_matter = { path = "components/front_matter" } utils = { path = "components/utils" } rebuild = { path = "components/rebuild" } +search = { path = "components/search" } [workspace] members = [ @@ -57,6 +59,9 @@ members = [ "components/library", ] +[features] +tantivy-indexing = ["tantivy", "search/tantivy-indexing"] + [profile.release] lto = true codegen-units = 1 diff --git a/components/search/Cargo.toml b/components/search/Cargo.toml index 1b08648..209bf22 100644 --- a/components/search/Cargo.toml +++ b/components/search/Cargo.toml @@ -8,6 +8,16 @@ edition = "2018" elasticlunr-rs = "2" ammonia = "3" lazy_static = "1" +tantivy = { version = "0.12", optional = true } +isolang = { version = "1.0", optional = true } +serde = { version = "1.0", optional = true } +serde_derive = { version = "1.0", optional = true } +serde_json = { version = "1.0", optional = true } +num_cpus = { version = "1.12", optional = true } errors = { path = "../errors" } library = { path = "../library" } + +[features] +default = [] +tantivy-indexing = ["isolang", "serde", "serde_derive", "serde_json", "num_cpus", "tantivy"] diff --git a/components/search/src/lib.rs b/components/search/src/lib.rs index 
76eee5d..8857c82 100644
--- a/components/search/src/lib.rs
+++ b/components/search/src/lib.rs
@@ -3,7 +3,7 @@ use std::collections::{HashMap, HashSet};
 use elasticlunr::{Index, Language};
 use lazy_static::lazy_static;
 
-use errors::{bail, Result};
+use errors::{bail, Result, Error};
 use library::{Library, Section};
 
 pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");
@@ -79,3 +79,121 @@ fn add_section_to_index(index: &mut Index, section: &Section, library: &Library)
         );
     }
 }
+
+/// Resolves a language identifier to the tantivy stemmer language it denotes.
+///
+/// Input of length 2 is looked up as an ISO 639-1 code and input of length 3 as an
+/// ISO 639-3 code, both case-insensitively. Anything else is treated as a language name
+/// spelled exactly like a tantivy `Language` variant (e.g. `English`).
+///
+/// Returns `None` if the code is unknown or its language name is not a `Language` variant.
+#[cfg(feature = "tantivy-indexing")]
+fn parse_language(lang: &str) -> Option<tantivy::tokenizer::Language> {
+    // Conversion goes through `Language`'s serde `Deserialize` impl. Wrapping the name
+    // in `Value::String` hands serde a proper string, so no manual JSON quoting or
+    // escaping is required.
+    fn from_name(name: &str) -> Option<tantivy::tokenizer::Language> {
+        serde_json::from_value(serde_json::Value::String(name.to_owned())).ok()
+    }
+
+    let code = lang.to_lowercase();
+    match lang.len() {
+        2 => isolang::Language::from_639_1(&code).and_then(|parsed| from_name(parsed.to_name())),
+        3 => isolang::Language::from_639_3(&code).and_then(|parsed| from_name(parsed.to_name())),
+        // apparently not a code, so this is the best available option
+        _ => from_name(lang),
+    }
+}
+
+/// Builds a tantivy search index in `output_dir/tantivy-index`.
+///
+/// One document, with a stemmed and stored `title` field and a stored-only `permalink`
+/// field, is added per page of every section whose language is `lang`, provided both the
+/// section and the page have `in_search_index` set. Page bodies are not indexed.
+///
+/// # Errors
+///
+/// Fails if `lang` cannot be resolved to a stemmer language, if the index directory cannot
+/// be created, or if tantivy fails to create the index or its writer, to commit, or to
+/// join its merging threads.
+#[cfg(feature = "tantivy-indexing")]
+pub fn build_tantivy_index(lang: &str, library: &Library, output_dir: &str) -> Result<()> {
+    use tantivy::doc;
+    use tantivy::{schema::*, tokenizer::*, Index};
+
+    let parsed_lang: Language = parse_language(lang)
+        .ok_or_else(|| Error::from(format!("failed to parse language: '{}'", lang)))?;
+
+    // English is looked up as "en_stem"; any other language uses its lower-cased variant
+    // name, e.g. "french_stem".
+    let tokenizer_name = match parsed_lang {
+        Language::English => "en_stem".to_string(),
+        other => format!("{:?}_stem", other).to_lowercase(),
+    };
+
+    let text_indexing_options = TextFieldIndexing::default()
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions)
+        .set_tokenizer(&tokenizer_name);
+    let text_options =
+        TextOptions::default().set_indexing_options(text_indexing_options).set_stored();
+
+    let mut schema = SchemaBuilder::new();
+    let title = schema.add_text_field("title", text_options);
+    let permalink = schema.add_text_field("permalink", STORED);
+    let schema = schema.build();
+
+    // Create the target directory here so the function does not depend on its caller
+    // having done so.
+    let index_dir = std::path::Path::new(output_dir).join("tantivy-index");
+    std::fs::create_dir_all(&index_dir).map_err(|e| {
+        Error::from(format!("creating directory {} failed: {}", index_dir.display(), e))
+    })?;
+
+    let index = Index::create_in_dir(&index_dir, schema)
+        .map_err(|e| Error::from(format!("creating tantivy index failed: {}", e)))?;
+
+    // if non-english, we need to register the stemmer ourselves
+    if index.tokenizers().get(&tokenizer_name).is_none() {
+        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
+            .filter(RemoveLongFilter::limit(40))
+            .filter(LowerCaser)
+            .filter(Stemmer::new(parsed_lang));
+        index.tokenizers().register(&tokenizer_name, tokenizer);
+    }
+
+    // 256 MiB indexing memory budget
+    let mut writer = index
+        .writer(1024 * 1024 * 256)
+        .map_err(|e| Error::from(format!("creating tantivy index writer failed: {}", e)))?;
+
+    for section in library.sections_values() {
+        if section.lang != lang || !section.meta.in_search_index {
+            continue;
+        }
+
+        for key in &section.pages {
+            let page = library.get_page_by_key(*key);
+            if !page.meta.in_search_index {
+                continue;
+            }
+
+            // each page is added exactly once; duplicates would show up as duplicate hits
+            writer.add_document(doc!(
+                title => page.meta.title.as_ref().map(|t| t.as_str()).unwrap_or(""),
+                permalink => page.permalink.as_str(),
+            ));
+        }
+    }
+
+    // a single `commit` call is used; no separate `prepare_commit` beforehand
+    let commit_opstamp = writer
+        .commit()
+        .map_err(|e| Error::from(format!("tantivy IndexWriter::commit failed: {}", e)))?;
+    println!("committed {:?}", commit_opstamp);
+
+    writer.wait_merging_threads().map_err(|e| {
+        Error::from(format!("tantivy IndexWriter::wait_merging_threads failed: {}", e))
+    })?;
+
+    Ok(())
+}
diff --git a/components/site/src/lib.rs b/components/site/src/lib.rs
index eaba38e..86cf0ee 100644
--- a/components/site/src/lib.rs
+++ b/components/site/src/lib.rs
@@ -54,7 +54,7 @@ impl Site {
         config.load_extra_syntaxes(path)?;
 
         let tpl_glob =
-            format!("{}/{}", path.to_string_lossy().replace("\\", "/"), "templates/**/*.*ml");
+            format!("{}/{}", path.to_string_lossy().replace("\\", "/"), "templates/**/*.*ml"); // "
         // Only
parsing as we might be extending templates from themes and that would error // as we haven't loaded them yet let mut tera = @@ -71,7 +71,7 @@ impl Site { let theme_tpl_glob = format!( "{}/{}", - path.to_string_lossy().replace("\\", "/"), + path.to_string_lossy().replace("\\", "/"), // " format!("themes/{}/templates/**/*.*ml", theme) ); let mut tera_theme = Tera::parse(&theme_tpl_glob) @@ -165,7 +165,7 @@ impl Site { /// Reads all .md files in the `content` directory and create pages/sections /// out of them pub fn load(&mut self) -> Result<()> { - let base_path = self.base_path.to_string_lossy().replace("\\", "/"); + let base_path = self.base_path.to_string_lossy().replace("\\", "/"); // " let content_glob = format!("{}/{}", base_path, "content/**/*.md"); let (section_entries, page_entries): (Vec<_>, Vec<_>) = glob(&content_glob) diff --git a/src/cli.rs b/src/cli.rs index 5feef1c..be2957b 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -98,26 +98,36 @@ pub fn build_cli() -> App<'static, 'static> { ]), SubCommand::with_name("index") .about("Create a search index as a stand-alone task, and with additional options") - .args(&[ - Arg::with_name("index_type") - .long("index-type") - .short("t") - .takes_value(true) - .possible_values(&["elasticlunr", "tantivy"]) - .required(true) - .help("what kind of search index to build"), - Arg::with_name("output_dir") - .short("o") - .long("output-dir") - .default_value("public") - .takes_value(true) - .help("Outputs the generated search index files into the provided dir. 
\ - Note: Tantivy indexing produces a directory instead of a file, \ - which will be located at output-dir/tantivy-index"), - Arg::with_name("drafts") - .long("drafts") + .args({ + let drafts = Arg::with_name("drafts") .long("drafts") .takes_value(false) - .help("Include drafts when loading the site"), - ]), + .help("Include drafts when loading the site"); + + #[cfg(feature = "tantivy-indexing")] + { + + let index_type = Arg::with_name("index_type") + .long("index-type") + .short("t") + .takes_value(true) + .possible_values(&["elasticlunr", "tantivy"]) + .required(true) + .help("what kind of search index to build"); + let output_dir = Arg::with_name("output_dir") + .short("o") + .long("output-dir") + .default_value("public") + .takes_value(true) + .help("Outputs the generated search index files into the provided dir. \ + Note: Tantivy indexing produces a directory instead of a file, \ + which will be located at output-dir/tantivy-index"); + &[drafts, index_type, output_dir] + } + + #[cfg(not(feature = "tantivy-indexing"))] + { + &[drafts] + } + }), ]) } diff --git a/src/cmd/index.rs b/src/cmd/index.rs index 4d0c062..7d6b659 100644 --- a/src/cmd/index.rs +++ b/src/cmd/index.rs @@ -30,16 +30,28 @@ pub fn index( // somehow impacting the search indexing? doesn't seem like // it could, but maybe - match index_type { - "elasticlunr" => { - site.build_search_index()?; + #[cfg(feature = "tantivy-indexing")] + { + match index_type { + "elasticlunr" => { + site.build_search_index()?; + } + + "tantivy" => { + //if ! 
Path::new(output_dir).exists() { + // std::fs::create_dir_all(output_dir)?; + //} + let index_dir = Path::new(output_dir).join("tantivy-index"); + utils::fs::ensure_directory_exists(&index_dir)?; + + let lang = &site.config.default_language; + let library = site.library.read().unwrap(); // unwrap originally in Site::build_search_index, just parroting here, no idea if safe + + search::build_tantivy_index(lang, &library, output_dir)?; + } + + _ => unreachable!() } - - "tantivy" => { - unimplemented!() - } - - _ => unreachable!() } Ok(())