diff --git a/components/search/src/lib.rs b/components/search/src/lib.rs index 8f7fec6..30a7e0f 100644 --- a/components/search/src/lib.rs +++ b/components/search/src/lib.rs @@ -3,6 +3,7 @@ use std::collections::{HashMap, HashSet}; use elasticlunr::{Index, Language}; use lazy_static::lazy_static; +#[allow(unused_imports)] use errors::{bail, Result, Error}; use library::{Library, Section}; @@ -29,7 +30,7 @@ lazy_static! { /// the language given /// Errors if the language given is not available in Elasticlunr /// TODO: is making `in_search_index` apply to subsections of a `false` section useful? -pub fn build_index(lang: &str, library: &Library) -> Result { +pub fn build_index(lang: &str, library: &Library) -> Result { let language = match Language::from_code(lang) { Some(l) => l, None => { @@ -45,7 +46,7 @@ pub fn build_index(lang: &str, library: &Library) -> Result { } } - Ok(index.to_json()) + Ok(index) } fn add_section_to_index(index: &mut Index, section: &Section, library: &Library) { @@ -108,13 +109,11 @@ fn parse_language(lang: &str) -> Option { } #[cfg(feature = "tantivy-indexing")] -pub fn build_tantivy_index( +pub fn build_tantivy_index>( lang: &str, library: &Library, - output_dir: &str, - //skip_section_pages: bool, - -) -> Result<()> { + index_dir: P, +) -> Result { use tantivy::{schema::*, tokenizer::*, Index, Document}; use tantivy::doc; @@ -124,10 +123,10 @@ pub fn build_tantivy_index( let tokenizer_name: String = match parsed_lang { Language::English => "en_stem".to_string(), - other => format!("{:?}_stem", parsed_lang).to_lowercase(), + other => format!("{:?}_stem", other).to_lowercase(), }; - let mut text_indexing_options = TextFieldIndexing::default() + let text_indexing_options = TextFieldIndexing::default() .set_index_option(IndexRecordOption::WithFreqsAndPositions) .set_tokenizer(&tokenizer_name); @@ -143,13 +142,9 @@ pub fn build_tantivy_index( let schema = schema.build(); - let index_dir = 
std::path::Path::new(output_dir).join("tantivy-index"); - //utils::fs::ensure_directory_exists(&index_dir)?; - - let mut index = Index::create_in_dir(&index_dir, schema.clone()) + let index = Index::create_in_dir(&index_dir, schema.clone()) .map_err(|e| { Error::from(format!("creating tantivy index failed: {}", e)) })?; - if index.tokenizers().get(&tokenizer_name).is_none() { // if non-english, we need to register stemmer let tokenizer = TextAnalyzer::from(SimpleTokenizer) .filter(RemoveLongFilter::limit(40)) @@ -158,72 +153,46 @@ pub fn build_tantivy_index( index.tokenizers().register(&tokenizer_name, tokenizer); } - //let mut wtr = index.writer_with_num_threads(num_cpus::get_physical(), 1024 * 1024 * 256) - //let mut wtr = index.writer_with_num_threads(4, 1024 * 1024 * 256) let mut wtr = index.writer(1024 * 1024 * 256) .map_err(|e| { Error::from(format!("creating tantivy index writer failed: {}", e)) })?; - //index_writer.set_merge_policy(Box::new(NoMergePolicy)); - - //let mut sections_it = library.sections_values().iter().filter(|s| s.lang == lang && s.meta.in_search_index); - let mut seen: HashSet = Default::default(); let mut n_indexed = 0; - //let group_size = 100_000; for section in library.sections_values() { - if section.lang != lang { continue } - if ! section.meta.in_search_index { continue } + // reason for macro: Section/Page are different types but have same attributes + macro_rules! index_page { + ($page:ident) => {{ + let already_indexed = seen.contains(&$page.permalink); + if ! 
already_indexed && $page.meta.in_search_index && $page.lang == lang { + seen.insert($page.permalink.clone()); // mark as indexed + let cleaned_body: String = AMMONIA.clean(&$page.content).to_string(); + let page_doc: Document = doc!( + title => $page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""), + body => cleaned_body.as_str(), + permalink => $page.permalink.as_str(), + ); + wtr.add_document(page_doc); + n_indexed += 1; + } + }} + } - // Don't index redirecting sections - //if section.meta.redirect_to.is_none() { - // index.add_doc( - // &section.permalink, - // &[ - // &section.meta.title.clone().unwrap_or_default(), - // &AMMONIA.clean(&section.content).to_string(), - // ], - // ); - //} + if section.meta.redirect_to.is_none() { + index_page!(section); + } for key in &section.pages { let page = library.get_page_by_key(*key); - - if !page.meta.in_search_index { continue; } - - if seen.contains(&page.permalink) { continue } - - seen.insert(page.permalink.clone()); - - //let mut doc = Document::default(); - //doc.add(FieldValue::new(title, Value::from(page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or("")))); - - let cleaned_body: String = AMMONIA.clean(&page.content).to_string(); - //doc.add(FieldValue::new(body, Value::from(cleaned_body.as_str()))); - - //doc.add(FieldValue::new(permalink, Value::from(page.permalink.as_str()))); - - let opstamp = wtr.add_document(doc!( - title => page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""), - body => cleaned_body.as_str(), - permalink => page.permalink.as_str(), - )); - println!("added {:?} {}", opstamp, page.permalink); - - n_indexed += 1; - - //if n_indexed % group_size == 0 { } + index_page!(page); } } - wtr.prepare_commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?; - let commit_opstamp = wtr.commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?; + //wtr.prepare_commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit 
failed: {}", e)) })?; + wtr.commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?; wtr.wait_merging_threads().map_err(|e| { Error::from(format!("tantivy IndexWriter::wait_merging_threads failed: {}", e)) })?; drop(index); - println!("finished indexing {} pages", n_indexed); - Ok(()) + Ok(n_indexed) } - - diff --git a/components/site/src/lib.rs b/components/site/src/lib.rs index 86cf0ee..ab1890e 100644 --- a/components/site/src/lib.rs +++ b/components/site/src/lib.rs @@ -778,24 +778,34 @@ impl Site { Ok(()) } - pub fn build_search_index(&self) -> Result<()> { + pub fn build_search_index(&self) -> Result { + let mut n_indexed = 0; ensure_directory_exists(&self.output_path)?; // index first + let index = search::build_index(&self.config.default_language, &self.library.read().unwrap()) + .map_err(|e| Error::from(format!("creating elasticlunr index failed: {}", e)))?; + n_indexed += index.document_store.len(); + create_file( &self.output_path.join(&format!("search_index.{}.js", self.config.default_language)), &format!( "window.searchIndex = {};", - search::build_index(&self.config.default_language, &self.library.read().unwrap())? + index.to_json(), ), )?; for language in &self.config.languages { if language.code != self.config.default_language && language.search { + // Index this language's own content; the default language was already indexed above. + let index = search::build_index(&language.code, &self.library.read().unwrap()) + .map_err(|e| Error::from(format!("creating elasticlunr index failed: {}", e)))?; + n_indexed += index.document_store.len(); + create_file( &self.output_path.join(&format!("search_index.{}.js", &language.code)), &format!( "window.searchIndex = {};", - search::build_index(&language.code, &self.library.read().unwrap())? 
+ index.to_json() ), )?; } @@ -804,7 +814,7 @@ impl Site { // then elasticlunr.min.js create_file(&self.output_path.join("elasticlunr.min.js"), search::ELASTICLUNR_JS)?; - Ok(()) + Ok(n_indexed) } pub fn compile_sass(&self, base_path: &Path) -> Result<()> { diff --git a/src/cmd/index.rs b/src/cmd/index.rs index 7d6b659..0c4c61b 100644 --- a/src/cmd/index.rs +++ b/src/cmd/index.rs @@ -1,7 +1,8 @@ use std::path::Path; - -use errors::Result; +use std::time::*; +use errors::{Result, Error}; use site::Site; +use crate::console; //use crate::console; @@ -11,6 +12,7 @@ pub fn index( base_url: Option<&str>, output_dir: &str, include_drafts: bool, + #[cfg(feature = "tantivy-indexing")] index_type: &str, ) -> Result<()> { let mut site = Site::new(root_dir, config_file)?; @@ -25,34 +27,46 @@ pub fn index( site.include_drafts(); } site.load()?; + console::notify_site_size(&site); + console::warn_about_ignored_pages(&site); - // TODO: could skipping the theme and/or sass prep end up - // somehow impacting the search indexing? doesn't seem like - // it could, but maybe + let do_elastic_index = || -> Result<()> { + let indexing_start = Instant::now(); + let n_indexed = site.build_search_index() + .map_err(|e| Error::from(format!("creating elasticlunr index failed: {}", e)))?; + let indexing_took = Instant::now() - indexing_start; + console::report_n_pages_indexed(n_indexed, indexing_took); + Ok(()) + }; #[cfg(feature = "tantivy-indexing")] { match index_type { - "elasticlunr" => { - site.build_search_index()?; - } + "elasticlunr" => do_elastic_index()?, "tantivy" => { - //if ! 
Path::new(output_dir).exists() { - // std::fs::create_dir_all(output_dir)?; - //} let index_dir = Path::new(output_dir).join("tantivy-index"); + if index_dir.exists() { + std::fs::remove_dir_all(&index_dir)?; + } utils::fs::ensure_directory_exists(&index_dir)?; - let lang = &site.config.default_language; let library = site.library.read().unwrap(); // unwrap originally in Site::build_search_index, just parroting here, no idea if safe - search::build_tantivy_index(lang, &library, output_dir)?; + let indexing_start = Instant::now(); + let n_pages_indexed = search::build_tantivy_index(lang, &library, &index_dir)?; + let indexing_took = Instant::now() - indexing_start; + console::report_n_pages_indexed(n_pages_indexed, indexing_took); } _ => unreachable!() } } + #[cfg(not(feature = "tantivy-indexing"))] + { + do_elastic_index()?; + } + Ok(()) } diff --git a/src/console.rs b/src/console.rs index 9853866..2e5c4c7 100644 --- a/src/console.rs +++ b/src/console.rs @@ -97,6 +97,16 @@ pub fn warn_about_ignored_pages(site: &Site) { } } +pub fn report_n_pages_indexed(n_pages_indexed: usize, took: std::time::Duration) { + let duration_sec = took.as_secs_f64(); + let duration_ms = (duration_sec * 1000.0).floor(); // whole milliseconds, so the message reads "12ms" rather than "12.345678ms" + if duration_ms < 1000.0 { + success(&format!("Search: indexed {} pages in {}ms.\n", n_pages_indexed, duration_ms)); + } else { + success(&format!("Search: indexed {} pages in {:.1}s.\n", n_pages_indexed, ((duration_sec * 10.0).round() / 10.0))); + } +} + /// Print the time elapsed rounded to 1 decimal pub fn report_elapsed_time(instant: Instant) { let duration_ms = Duration::from_std(instant.elapsed()).unwrap().num_milliseconds() as f64; diff --git a/src/main.rs b/src/main.rs index 8e1974c..0672955 100644 --- a/src/main.rs +++ b/src/main.rs @@ -115,16 +115,22 @@ fn main() { console::info("Building search index..."); let start = Instant::now(); let output_dir = matches.value_of("output_dir").unwrap(); - let index_type = matches.value_of("index_type").unwrap(); - match cmd::index( 
- &root_dir, - config_file, - matches.value_of("base_url"), - output_dir, - matches.is_present("drafts"), - index_type, - ) { + let show_drafts = matches.is_present("drafts"); + + let indexing_result = { + #[cfg(feature = "tantivy-indexing")] + { + let index_type = matches.value_of("index_type").unwrap(); + cmd::index( &root_dir, config_file, matches.value_of("base_url"), output_dir, show_drafts, index_type) + } + + #[cfg(not(feature = "tantivy-indexing"))] + cmd::index( &root_dir, config_file, matches.value_of("base_url"), output_dir, show_drafts) + }; + + match indexing_result { Ok(()) => console::report_elapsed_time(start), + Err(e) => { console::unravel_errors("Failed to build search index", &e); ::std::process::exit(1);