From ad36ac87b26437c8eda37502bb52a50710a8a84a Mon Sep 17 00:00:00 2001
From: Jonathan Strong
Date: Fri, 13 Mar 2020 04:20:31 -0400
Subject: [PATCH] adds (tantivy) indexing of taxonomies, datetime, description

---
 Review notes (changes relative to the patch as first posted):
 - the new `chrono` / `FromStr` imports are gated on the `tantivy-indexing` feature; `chrono`
   is an optional dependency, so an ungated `use chrono::...` breaks builds without the feature
 - doc comments added on the new helpers; comment typo fixed ("mark as indexed"); needless `&`
   dropped in `parse_dt_assume_utc`
 - the commit id above and the blob ids on the lib.rs `index` line predate these changes

 components/search/Cargo.toml |   4 +-
 components/search/src/lib.rs | 162 +++++++++++++++++++++++++++++------
 2 files changed, 136 insertions(+), 30 deletions(-)

diff --git a/components/search/Cargo.toml b/components/search/Cargo.toml
index 209bf22..be6bcaa 100644
--- a/components/search/Cargo.toml
+++ b/components/search/Cargo.toml
@@ -14,10 +14,12 @@ serde = { version = "1.0", optional = true }
 serde_derive = { version = "1.0", optional = true }
 serde_json = { version = "1.0", optional = true }
 num_cpus = { version = "1.12", optional = true }
+chrono = { version = "0.4", optional = true }
+
 
 errors = { path = "../errors" }
 library = { path = "../library" }
 
 [features]
 default = []
-tantivy-indexing = ["isolang", "serde", "serde_derive", "serde_json", "num_cpus", "tantivy"]
+tantivy-indexing = ["isolang", "serde", "serde_derive", "serde_json", "num_cpus", "tantivy", "chrono"]
diff --git a/components/search/src/lib.rs b/components/search/src/lib.rs
index 30a7e0f..7a4e5fd 100644
--- a/components/search/src/lib.rs
+++ b/components/search/src/lib.rs
@@ -1,5 +1,9 @@
 use std::collections::{HashMap, HashSet};
+#[cfg(feature = "tantivy-indexing")]
+use std::str::FromStr;
 
+#[cfg(feature = "tantivy-indexing")]
+use chrono::{DateTime, NaiveDateTime, TimeZone, Utc};
 use elasticlunr::{Index, Language};
 use lazy_static::lazy_static;
 
@@ -108,6 +112,37 @@ fn parse_language(lang: &str) -> Option {
     }
 }
 
+/// Resolves a page's date to a UTC `DateTime`, if it has one.
+///
+/// `datetime_string` is tried first because the raw string may carry an explicit
+/// timezone. Failing that, `naive_datetime` is interpreted as already being UTC.
+/// Returns `None` when neither source yields a value.
+#[cfg(feature = "tantivy-indexing")]
+fn parse_dt_assume_utc(datetime_string: &Option<String>, naive_datetime: &Option<NaiveDateTime>) -> Option<DateTime<Utc>> {
+    // start here because it will potentially have timezone in the string
+    if let Some(s) = datetime_string.as_ref() {
+        if let Ok(utc) = DateTime::from_str(s.as_str()) {
+            return Some(utc)
+        }
+    }
+
+    // otherwise, if we have the NaiveDateTime, we'll assume it's UTC. would not do this if the
+    // stakes were higher!
+    if let Some(naive) = naive_datetime {
+        return Some(Utc.from_utc_datetime(naive))
+    }
+
+    None
+}
+
+/// Replaces every `-` in a taxonomy name with `_`. The result is used as the name of
+/// the taxonomy's field in the index schema.
+/// NOTE(review): presumably needed because `-` is not accepted in tantivy field names; confirm.
+#[cfg(feature = "tantivy-indexing")]
+fn normalize_taxonomy_name(s: &str) -> String {
+    s.replace("-", "_")
+}
+
 #[cfg(feature = "tantivy-indexing")]
 pub fn build_tantivy_index>(
     lang: &str,
@@ -134,18 +169,96 @@ pub fn build_tantivy_index>(
         .set_indexing_options(text_indexing_options)
         .set_stored();
 
+    // Content for one document. Collected up front because the schema needs every taxonomy
+    // name before the index (and its writer) can be created.
+    struct IndexContent<'a> {
+        pub title: &'a str,
+        pub description: &'a str,
+        pub permalink: &'a str,
+        pub body: String,
+
+        pub datetime: Option<DateTime<Utc>>,
+        pub taxonomies: &'a HashMap<String, Vec<String>>,
+    }
+
+    let mut seen: HashSet<String> = Default::default(); // unique permalinks already indexed
+    let mut all_taxonomies: HashSet<String> = Default::default(); // remember any taxonomy used anywhere so we can add to schema
+    let mut index_pages: Vec<IndexContent> = Vec::new();
+    let mut n_indexed = 0;
+
+    let empty_taxonomies: HashMap<String, Vec<String>> = Default::default();
+
+    for section in library.sections_values() {
+
+        // reason for macro: Section/Page are different types but have same attributes
+        macro_rules! extract_content {
+            ($page:ident) => {{
+                let already_indexed = seen.contains(&$page.permalink);
+                if ! already_indexed && $page.meta.in_search_index && $page.lang == lang {
+                    seen.insert($page.permalink.clone()); // mark as indexed
+                    n_indexed += 1;
+
+                    let cleaned_body: String = AMMONIA.clean(&$page.content).to_string();
+
+                    Some(IndexContent {
+                        title: $page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
+                        description: $page.meta.description.as_ref().map(|x| x.as_str()).unwrap_or(""),
+                        permalink: $page.permalink.as_str(),
+                        body: cleaned_body,
+
+                        // page-only fields, leave blank
+                        datetime: None,
+                        taxonomies: &empty_taxonomies,
+                    })
+                } else {
+                    None
+                }
+            }}
+        }
+
+        if section.meta.redirect_to.is_none() {
+            if let Some(content) = extract_content!(section) {
+                index_pages.push(content);
+            }
+        }
+
+        for key in &section.pages {
+            let page = library.get_page_by_key(*key);
+            match extract_content!(page) {
+                Some(mut index_content) => {
+                    all_taxonomies.extend(page.meta.taxonomies.keys().map(|x| normalize_taxonomy_name(x)));
+                    index_content.taxonomies = &page.meta.taxonomies;
+                    index_content.datetime = parse_dt_assume_utc(&page.meta.date, &page.meta.datetime);
+                    index_pages.push(index_content);
+                }
+                None => {}
+            }
+        }
+    }
+
     let mut schema = SchemaBuilder::new();
 
-    let title = schema.add_text_field("title", text_options.clone());
-    let body = schema.add_text_field("body", text_options.clone());
-    let permalink = schema.add_text_field("permalink", STORED);
+    let mut fields: HashMap<String, Field> = Default::default();
+
+    for text_field_name in &["title", "body", "description"] {
+        fields.insert(text_field_name.to_string(), schema.add_text_field(text_field_name, text_options.clone()));
+    }
+    fields.insert("permalink".to_string(), schema.add_text_field("permalink", STORED));
+    fields.insert("datetime".to_string(), schema.add_date_field("datetime", STORED | INDEXED));
+
+    let reserved_field_names: HashSet<String> = fields.keys().map(|s| s.to_string()).collect();
+
+    for taxonomy_name in all_taxonomies.difference(&reserved_field_names) {
+        fields.insert(taxonomy_name.to_string(), schema.add_text_field(taxonomy_name.as_str(), text_options.clone()));
+    }
 
     let schema = schema.build();
 
     let index = Index::create_in_dir(&index_dir, schema.clone())
         .map_err(|e| { Error::from(format!("creating tantivy index failed: {}", e)) })?;
 
-    if index.tokenizers().get(&tokenizer_name).is_none() { // if non-english, we need to register stemmer
+    // take care of non-English stemmers if needed
+    if index.tokenizers().get(&tokenizer_name).is_none() {
         let tokenizer = TextAnalyzer::from(SimpleTokenizer)
             .filter(RemoveLongFilter::limit(40))
             .filter(LowerCaser)
@@ -156,37 +269,28 @@ pub fn build_tantivy_index>(
     let mut wtr = index.writer(1024 * 1024 * 256)
         .map_err(|e| { Error::from(format!("creating tantivy index writer failed: {}", e)) })?;
 
-    let mut seen: HashSet = Default::default();
-    let mut n_indexed = 0;
+    // now, let's index!
 
-    for section in library.sections_values() {
+    for page in index_pages {
+        let mut document: Document = doc!(
+            fields["title"] => page.title,
+            fields["description"] => page.description,
+            fields["permalink"] => page.permalink,
+            fields["body"] => page.body,
+        );
 
-        // reason for macro: Section/Page are different types but have same attributes
-        macro_rules! index_page {
-            ($page:ident) => {{
-                let already_indexed = seen.contains(&$page.permalink);
-                if ! already_indexed && $page.meta.in_search_index && $page.lang == lang {
-                    seen.insert($page.permalink.clone()); // mark ask indexed
-                    let cleaned_body: String = AMMONIA.clean(&$page.content).to_string();
-                    let page_doc: Document = doc!(
-                        title => $page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
-                        body => cleaned_body.as_str(),
-                        permalink => $page.permalink.as_str(),
-                    );
-                    wtr.add_document(page_doc);
-                    n_indexed += 1;
-                }
-            }}
+        if let Some(utc) = page.datetime {
+            document.add_date(fields["datetime"], &utc);
         }
 
-        if section.meta.redirect_to.is_none() {
-            index_page!(section);
+        for (taxonomy, terms) in page.taxonomies.iter().filter(|(k, _)| ! reserved_field_names.contains(k.as_str())) {
+            let normalized_taxonomy = normalize_taxonomy_name(taxonomy);
+            for term in terms.iter() {
+                document.add_text(fields[&normalized_taxonomy], term);
+            }
         }
 
-        for key in &section.pages {
-            let page = library.get_page_by_key(*key);
-            index_page!(page);
-        }
+        wtr.add_document(document);
     }
 
     //wtr.prepare_commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;