@@ -1,5 +1,6 @@
use std::str::FromStr;
use std::collections::{HashMap, HashSet};

use chrono::{DateTime, Utc, NaiveDateTime, TimeZone};
use elasticlunr::{Index, Language};
use lazy_static::lazy_static;

@@ -108,6 +109,29 @@ fn parse_language(lang: &str) -> Option<tantivy::tokenizer::Language> {
    }
}

#[cfg(feature = "tantivy-indexing")]
fn parse_dt_assume_utc(datetime_string: &Option<String>, naive_datetime: &Option<NaiveDateTime>) -> Option<DateTime<Utc>> {
    // Try the string form first, since it may carry an explicit timezone.
    if let Some(s) = datetime_string.as_ref() {
        if let Ok(utc) = DateTime::from_str(s.as_str()) {
            return Some(utc);
        }
    }

    // Otherwise, if we have a NaiveDateTime, assume it is UTC. We would not
    // make this assumption if the stakes were higher!
    if let Some(naive) = naive_datetime {
        return Some(Utc.from_utc_datetime(&naive));
    }

    None
}
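// Sketch of the intended behavior (example values, not taken from this change):
//   parse_dt_assume_utc(&Some("2021-06-01T12:00:00Z".into()), &None)
//       -> Some(...), parsed via `DateTime::<Utc>::from_str`, honoring any timezone in the string
//   parse_dt_assume_utc(&None, &Some(naive)) -> Some(naive reinterpreted as UTC)
//   parse_dt_assume_utc(&None, &None)        -> None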

#[cfg(feature = "tantivy-indexing")]
fn normalize_taxonomy_name(s: &str) -> String {
    s.replace("-", "_")
}
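// e.g. normalize_taxonomy_name("book-authors") -> "book_authors" ("book-authors" is a
// hypothetical taxonomy name); underscores keep the derived field names safe for tantivy's
// `field:term` query syntax, where a hyphen could be misread as an operator.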

#[cfg(feature = "tantivy-indexing")]
pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
    lang: &str,
@@ -134,18 +158,94 @@ pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
        .set_indexing_options(text_indexing_options)
        .set_stored();

    struct IndexContent<'a> {
        pub title: &'a str,
        pub description: &'a str,
        pub permalink: &'a str,
        pub body: String,

        pub datetime: Option<DateTime<Utc>>,
        pub taxonomies: &'a HashMap<String, Vec<String>>,
    }
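    // `body` is owned because Ammonia's clean() yields a fresh String; the other fields can
    // borrow from the page/section data, which outlives this collection pass.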

    let mut seen: HashSet<String> = Default::default(); // unique permalinks already indexed
    let mut all_taxonomies: HashSet<String> = Default::default(); // every taxonomy used anywhere, so we can add them all to the schema
    let mut index_pages: Vec<IndexContent> = Vec::new();
    let mut n_indexed = 0;

    let empty_taxonomies: HashMap<String, Vec<String>> = Default::default();

    for section in library.sections_values() {

        // reason for macro: Section and Page are different types but have the same attributes
        macro_rules! extract_content {
            ($page:ident) => {{
                let already_indexed = seen.contains(&$page.permalink);
                if !already_indexed && $page.meta.in_search_index && $page.lang == lang {
                    seen.insert($page.permalink.clone()); // mark as indexed
                    n_indexed += 1;

                    let cleaned_body: String = AMMONIA.clean(&$page.content).to_string();

                    Some(IndexContent {
                        title: $page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
                        description: $page.meta.description.as_ref().map(|x| x.as_str()).unwrap_or(""),
                        permalink: $page.permalink.as_str(),
                        body: cleaned_body,

                        // page-only fields; left blank here and filled in for pages below
                        datetime: None,
                        taxonomies: &empty_taxonomies,
                    })
                } else {
                    None
                }
            }}
        }
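        // (A generic fn would need a shared trait over Section and Page; the macro also lets
        // the body update `seen` and `n_indexed` in the enclosing scope without extra plumbing.)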

        if section.meta.redirect_to.is_none() {
            if let Some(content) = extract_content!(section) {
                index_pages.push(content);
            }
        }

        for key in &section.pages {
            let page = library.get_page_by_key(*key);
            if let Some(mut index_content) = extract_content!(page) {
                // unlike sections, pages can carry dates and taxonomies
                all_taxonomies.extend(page.meta.taxonomies.keys().map(|x| normalize_taxonomy_name(x)));
                index_content.taxonomies = &page.meta.taxonomies;
                index_content.datetime = parse_dt_assume_utc(&page.meta.date, &page.meta.datetime);
                index_pages.push(index_content);
            }
        }
    }
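    // Collection happens before any index exists because a tantivy schema is fixed at
    // creation time: we need the complete set of taxonomy field names before building it.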

    let mut schema = SchemaBuilder::new();

    let mut fields: HashMap<String, Field> = Default::default();

    for text_field_name in &["title", "body", "description"] {
        fields.insert(text_field_name.to_string(), schema.add_text_field(text_field_name, text_options.clone()));
    }
    fields.insert("permalink".to_string(), schema.add_text_field("permalink", STORED));
    fields.insert("datetime".to_string(), schema.add_date_field("datetime", STORED | INDEXED));

    let reserved_field_names: HashSet<String> = fields.keys().map(|s| s.to_string()).collect();

    for taxonomy_name in all_taxonomies.difference(&reserved_field_names) {
        fields.insert(taxonomy_name.to_string(), schema.add_text_field(taxonomy_name.as_str(), text_options.clone()));
    }

    let schema = schema.build();
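    // Rough shape of the finished schema (illustrative; assumes a site with a "tags" taxonomy):
    //   title / body / description -> indexed, stemmed, stored text fields
    //   permalink                  -> stored only, returned with results but not searched
    //   datetime                   -> stored + indexed date field
    //   tags                       -> indexed text field, one value per taxonomy term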

    let index = Index::create_in_dir(&index_dir, schema.clone())
        .map_err(|e| Error::from(format!("creating tantivy index failed: {}", e)))?;

    // take care of non-English stemmers if needed
    if index.tokenizers().get(&tokenizer_name).is_none() {
        let tokenizer = TextAnalyzer::from(SimpleTokenizer)
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
@@ -156,37 +256,28 @@ pub fn build_tantivy_index<P: AsRef<std::path::Path>>(

    let mut wtr = index.writer(1024 * 1024 * 256)
        .map_err(|e| Error::from(format!("creating tantivy index writer failed: {}", e)))?;
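    // 1024 * 1024 * 256 = 256 MB of heap for the writer to fill before it flushes a segment to disk.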

    // now, let's index!
    for page in index_pages {
        let mut document: Document = doc!(
            fields["title"] => page.title,
            fields["description"] => page.description,
            fields["permalink"] => page.permalink,
            fields["body"] => page.body,
        );

        if let Some(utc) = page.datetime {
            document.add_date(fields["datetime"], &utc);
        }

        for (taxonomy, terms) in page.taxonomies.iter().filter(|(k, _)| !reserved_field_names.contains(k.as_str())) {
            let normalized_taxonomy = normalize_taxonomy_name(taxonomy);
            for term in terms.iter() {
                document.add_text(fields[&normalized_taxonomy], term);
            }
        }

        wtr.add_document(document);
    }

    //wtr.prepare_commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;