From ad36ac87b26437c8eda37502bb52a50710a8a84a Mon Sep 17 00:00:00 2001
From: Jonathan Strong <jonathan.strong@gmail.com>
Date: Fri, 13 Mar 2020 04:20:31 -0400
Subject: [PATCH] adds (tantivy) indexing of taxonomies, datetime, description

---
 components/search/Cargo.toml |   4 +-
 components/search/src/lib.rs | 151 ++++++++++++++++++++++++++++-------
 2 files changed, 124 insertions(+), 31 deletions(-)
diff --git a/components/search/Cargo.toml b/components/search/Cargo.toml
index 209bf22..be6bcaa 100644
--- a/components/search/Cargo.toml
+++ b/components/search/Cargo.toml
@@ -14,10 +14,12 @@ serde = { version = "1.0", optional = true }
 serde_derive = { version = "1.0", optional = true }
 serde_json = { version = "1.0", optional = true }
 num_cpus = { version = "1.12", optional = true }
+chrono = { version = "0.4", optional = true }
+
 
 errors = { path = "../errors" }
 library = { path = "../library" }
 
 [features]
 default = []
-tantivy-indexing = ["isolang", "serde", "serde_derive", "serde_json", "num_cpus", "tantivy"]
+tantivy-indexing = ["isolang", "serde", "serde_derive", "serde_json", "num_cpus", "tantivy", "chrono"]
diff --git a/components/search/src/lib.rs b/components/search/src/lib.rs
index 30a7e0f..7a4e5fd 100644
--- a/components/search/src/lib.rs
+++ b/components/search/src/lib.rs
@@ -1,5 +1,6 @@
+#[cfg(feature = "tantivy-indexing")] use std::str::FromStr; // only needed by the feature-gated date parsing
 use std::collections::{HashMap, HashSet};
-
+#[cfg(feature = "tantivy-indexing")] use chrono::{DateTime, Utc, NaiveDateTime, TimeZone}; // chrono is an optional dependency, present only with this feature
 use elasticlunr::{Index, Language};
 use lazy_static::lazy_static;
 
@@ -108,6 +109,29 @@ fn parse_language(lang: &str) -> Option<tantivy::tokenizer::Language> {
     }
 }
 
+/// Resolves a page's timestamp to UTC for the `datetime` index field.
+///
+/// `datetime_string` is tried first because the raw string may carry a timezone
+/// offset; when it is absent or unparseable, `naive_datetime` is assumed to
+/// already be UTC. Returns `None` when neither source yields a value.
+#[cfg(feature = "tantivy-indexing")]
+fn parse_dt_assume_utc(datetime_string: &Option<String>, naive_datetime: &Option<NaiveDateTime>) -> Option<DateTime<Utc>> {
+    let from_string = datetime_string
+        .as_ref()
+        .and_then(|raw| DateTime::<Utc>::from_str(raw.as_str()).ok());
+
+    from_string.or_else(|| {
+        naive_datetime
+            .as_ref()
+            .map(|naive| Utc.from_utc_datetime(naive))
+    })
+}
+
+#[cfg(feature = "tantivy-indexing")]
+fn normalize_taxonomy_name(s: &str) -> String {
+    s.replace('-', "_") // hyphens become underscores; the result is used as a schema field name
+}
+
 #[cfg(feature = "tantivy-indexing")]
 pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
     lang: &str,
@@ -134,18 +158,94 @@ pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
         .set_indexing_options(text_indexing_options)
         .set_stored();
 
+    struct IndexContent<'a> { // fields taken from one section or page, held until the schema has been built
+        pub title: &'a str, // "" when `meta.title` is `None`
+        pub description: &'a str, // "" when `meta.description` is `None`
+        pub permalink: &'a str,
+        pub body: String, // content after `AMMONIA.clean`
+
+        pub datetime: Option<DateTime<Utc>>, // `None` for sections and for pages with no usable date
+        pub taxonomies: &'a HashMap<String, Vec<String>>, // taxonomy name -> terms; the shared empty map for sections
+    }
+
+    let mut seen: HashSet<String> = Default::default(); // unique permalinks already indexed
+    let mut all_taxonomies: HashSet<String> = Default::default(); // every (normalized) taxonomy name used by an indexed page, so each can get a schema field
+    let mut index_pages: Vec<IndexContent> = Vec::new();
+    let mut n_indexed = 0;
+
+    let empty_taxonomies: HashMap<String, Vec<String>> = Default::default();
+
+    for section in library.sections_values() {
+
+        // A macro rather than a function: Section and Page are different types but have the same attributes.
+        macro_rules! extract_content {
+            ($page:ident) => {{ // evaluates to Some(IndexContent) when `$page` should be indexed, otherwise None
+                let already_indexed = seen.contains(&$page.permalink);
+                if ! already_indexed  && $page.meta.in_search_index && $page.lang == lang { // skip duplicate permalinks, items excluded from search, and other languages
+                    seen.insert($page.permalink.clone()); // mark as indexed
+                    n_indexed += 1;
+
+                    let cleaned_body: String = AMMONIA.clean(&$page.content).to_string();
+
+                    Some(IndexContent {
+                        title: $page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
+                        description: $page.meta.description.as_ref().map(|x| x.as_str()).unwrap_or(""),
+                        permalink:  $page.permalink.as_str(),
+                        body: cleaned_body,
+
+                        // page-only fields: left at their defaults here; the caller fills them in for pages
+                        datetime: None,
+                        taxonomies: &empty_taxonomies,
+                    })
+                } else {
+                    None
+                }
+            }}
+        }
+
+        if section.meta.redirect_to.is_none() {
+            if let Some(content) = extract_content!(section) {
+                index_pages.push(content);
+            }
+        }
+
+        for key in &section.pages {
+            let page = library.get_page_by_key(*key);
+            match extract_content!(page) {
+                Some(mut index_content) => {
+                    all_taxonomies.extend(page.meta.taxonomies.keys().map(|x| normalize_taxonomy_name(x)));
+                    index_content.taxonomies = &page.meta.taxonomies;
+                    index_content.datetime = parse_dt_assume_utc(&page.meta.date, &page.meta.datetime);
+                    index_pages.push(index_content);
+                }
+                None => {}
+            }
+        }
+    }
+
     let mut schema = SchemaBuilder::new();
 
-    let title = schema.add_text_field("title", text_options.clone());
-    let body = schema.add_text_field("body", text_options.clone());
-    let permalink = schema.add_text_field("permalink", STORED); 
+    let mut fields: HashMap<String, Field> = Default::default();
+
+    for text_field_name in &["title", "body", "description"] {
+        fields.insert(text_field_name.to_string(), schema.add_text_field(text_field_name, text_options.clone()));
+    }
+    fields.insert("permalink".to_string(), schema.add_text_field("permalink", STORED)); 
+    fields.insert("datetime".to_string(), schema.add_date_field("datetime", STORED | INDEXED)); 
+
+    let reserved_field_names: HashSet<String> = fields.keys().map(|s| s.to_string()).collect();
+
+    for taxonomy_name in all_taxonomies.difference(&reserved_field_names) {
+        fields.insert(taxonomy_name.to_string(), schema.add_text_field(taxonomy_name.as_str(), text_options.clone()));
+    }
 
     let schema = schema.build(); 
 
     let index = Index::create_in_dir(&index_dir, schema.clone())
         .map_err(|e| { Error::from(format!("creating tantivy index failed: {}", e)) })?;
 
-    if index.tokenizers().get(&tokenizer_name).is_none() { // if non-english, we need to register stemmer
+    // take care of non-English stemmers if needed
+    if index.tokenizers().get(&tokenizer_name).is_none() {
         let tokenizer = TextAnalyzer::from(SimpleTokenizer)
             .filter(RemoveLongFilter::limit(40))
             .filter(LowerCaser)
@@ -156,37 +256,28 @@ pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
     let mut wtr = index.writer(1024 * 1024 * 256)
         .map_err(|e| { Error::from(format!("creating tantivy index writer failed: {}", e)) })?;
 
-    let mut seen: HashSet<String> = Default::default();
-    let mut n_indexed = 0;
+    // now, let's index!
 
-    for section in library.sections_values() {
+    for page in index_pages {
+        let mut document: Document = doc!(
+            fields["title"] => page.title,
+            fields["description"] => page.description,
+            fields["permalink"] => page.permalink,
+            fields["body"] => page.body,
+        );
 
-        // reason for macro: Section/Page are different types but have same attributes
-        macro_rules! index_page {
-            ($page:ident) => {{
-                let already_indexed = seen.contains(&$page.permalink);
-                if ! already_indexed  && $page.meta.in_search_index && $page.lang == lang {
-                    seen.insert($page.permalink.clone()); // mark ask indexed
-                    let cleaned_body: String = AMMONIA.clean(&$page.content).to_string();
-                    let page_doc: Document = doc!(
-                        title => $page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
-                        body => cleaned_body.as_str(),
-                        permalink => $page.permalink.as_str(),
-                    );
-                    wtr.add_document(page_doc);
-                    n_indexed += 1;
-                }
-            }}
+        if let Some(utc) = page.datetime {
+            document.add_date(fields["datetime"], &utc);
         }
 
-        if section.meta.redirect_to.is_none() {
-            index_page!(section);
+        for (taxonomy, terms) in page.taxonomies.iter().filter(|(k, _)| ! reserved_field_names.contains(k.as_str())) {
+            let normalized_taxonomy = normalize_taxonomy_name(taxonomy);
+            for term in terms.iter() {
+                document.add_text(fields[&normalized_taxonomy], term);
+            }
         }
 
-        for key in &section.pages {
-            let page = library.get_page_by_key(*key);
-            index_page!(page);
-        }
+        wtr.add_document(document);
     }
 
     //wtr.prepare_commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;