Browse Source

adds (tantivy) indexing of taxonomies, datetime, description

Jonathan Strong 4 years ago
2 changed files with 124 additions and 31 deletions
  1. +3
  2. +121

+ 3
- 1
components/search/Cargo.toml View File

@@ -14,10 +14,12 @@ serde = { version = "1.0", optional = true }
serde_derive = { version = "1.0", optional = true } serde_derive = { version = "1.0", optional = true }
serde_json = { version = "1.0", optional = true } serde_json = { version = "1.0", optional = true }
num_cpus = { version = "1.12", optional = true } num_cpus = { version = "1.12", optional = true }
chrono = { version = "0.4", optional = true }

errors = { path = "../errors" } errors = { path = "../errors" }
library = { path = "../library" } library = { path = "../library" }

[features] [features]
default = [] default = []
tantivy-indexing = ["isolang", "serde", "serde_derive", "serde_json", "num_cpus", "tantivy"]
tantivy-indexing = ["isolang", "serde", "serde_derive", "serde_json", "num_cpus", "tantivy", "chrono"]

+ 121
- 30
components/search/src/ View File

@@ -1,5 +1,6 @@
use std::str::FromStr;
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use chrono::{DateTime, Utc, NaiveDateTime, TimeZone};
use elasticlunr::{Index, Language}; use elasticlunr::{Index, Language};
use lazy_static::lazy_static; use lazy_static::lazy_static;

@@ -108,6 +109,29 @@ fn parse_language(lang: &str) -> Option<tantivy::tokenizer::Language> {
} }
} }

/// Resolves a page's date information to a UTC timestamp for the search index.
///
/// * `datetime_string` - the raw date string, tried first because it may carry
///   timezone information.
/// * `naive_datetime` - an already-parsed datetime without timezone, used only
///   when the string is absent or does not parse; it is assumed to be UTC.
///
/// Returns `None` when neither input yields a timestamp.
#[cfg(feature = "tantivy-indexing")]
fn parse_dt_assume_utc(datetime_string: &Option<String>, naive_datetime: &Option<NaiveDateTime>) -> Option<DateTime<Utc>> {
    datetime_string
        .as_ref()
        // start here because it will potentially have timezone in the string
        .and_then(|s| DateTime::<Utc>::from_str(s.as_str()).ok())
        // otherwise, if we have the NaiveDateTime, we'll assume it's UTC. would not do this
        // if the stakes were higher!
        .or_else(|| naive_datetime.as_ref().map(|naive| Utc.from_utc_datetime(naive)))
}


/// Converts a taxonomy name into the form used as an index field name by
/// replacing every hyphen with an underscore (e.g. `my-tag` becomes `my_tag`).
///
/// The same normalization is applied when registering taxonomy fields in the
/// schema and when adding terms to a document, so both must go through this
/// function to agree on the field name.
#[cfg(feature = "tantivy-indexing")]
fn normalize_taxonomy_name(s: &str) -> String {
    s.replace('-', "_")
}

#[cfg(feature = "tantivy-indexing")] #[cfg(feature = "tantivy-indexing")]
pub fn build_tantivy_index<P: AsRef<std::path::Path>>( pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
lang: &str, lang: &str,
@@ -134,18 +158,94 @@ pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
.set_indexing_options(text_indexing_options) .set_indexing_options(text_indexing_options)
.set_stored(); .set_stored();

struct IndexContent<'a> {
pub title: &'a str,
pub description: &'a str,
pub permalink: &'a str,
pub body: String,

pub datetime: Option<DateTime<Utc>>,
pub taxonomies: &'a HashMap<String, Vec<String>>,

let mut seen: HashSet<String> = Default::default(); // unique permalinks already indexed
let mut all_taxonomies: HashSet<String> = Default::default(); // remember any taxonomy used anywhere so we can add to schema
let mut index_pages: Vec<IndexContent> = Vec::new();
let mut n_indexed = 0;

let empty_taxonomies: HashMap<String, Vec<String>> = Default::default();

for section in library.sections_values() {

// reason for macro: Section/Page are different types but have same attributes
macro_rules! extract_content {
($page:ident) => {{
let already_indexed = seen.contains(&$page.permalink);
if ! already_indexed && $page.meta.in_search_index && $page.lang == lang {
seen.insert($page.permalink.clone()); // mark as indexed
n_indexed += 1;

let cleaned_body: String = AMMONIA.clean(&$page.content).to_string();

Some(IndexContent {
title: $page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
description: $page.meta.description.as_ref().map(|x| x.as_str()).unwrap_or(""),
permalink: $page.permalink.as_str(),
body: cleaned_body,

// page-only fields, leave blank
datetime: None,
taxonomies: &empty_taxonomies,
} else {

if section.meta.redirect_to.is_none() {
if let Some(content) = extract_content!(section) {

for key in &section.pages {
let page = library.get_page_by_key(*key);
match extract_content!(page) {
Some(mut index_content) => {
all_taxonomies.extend(page.meta.taxonomies.keys().map(|x| normalize_taxonomy_name(x)));
index_content.taxonomies = &page.meta.taxonomies;
index_content.datetime = parse_dt_assume_utc(&, &page.meta.datetime);
None => {}

let mut schema = SchemaBuilder::new(); let mut schema = SchemaBuilder::new();

let title = schema.add_text_field("title", text_options.clone());
let body = schema.add_text_field("body", text_options.clone());
let permalink = schema.add_text_field("permalink", STORED);
let mut fields: HashMap<String, Field> = Default::default();

for text_field_name in &["title", "body", "description"] {
fields.insert(text_field_name.to_string(), schema.add_text_field(text_field_name, text_options.clone()));
fields.insert("permalink".to_string(), schema.add_text_field("permalink", STORED));
fields.insert("datetime".to_string(), schema.add_date_field("datetime", STORED | INDEXED));

let reserved_field_names: HashSet<String> = fields.keys().map(|s| s.to_string()).collect();

for taxonomy_name in all_taxonomies.difference(&reserved_field_names) {
fields.insert(taxonomy_name.to_string(), schema.add_text_field(taxonomy_name.as_str(), text_options.clone()));

let schema =; let schema =;

let index = Index::create_in_dir(&index_dir, schema.clone()) let index = Index::create_in_dir(&index_dir, schema.clone())
.map_err(|e| { Error::from(format!("creating tantivy index failed: {}", e)) })?; .map_err(|e| { Error::from(format!("creating tantivy index failed: {}", e)) })?;

if index.tokenizers().get(&tokenizer_name).is_none() { // if non-english, we need to register stemmer
// take care of non-English stemmers if needed
if index.tokenizers().get(&tokenizer_name).is_none() {
let tokenizer = TextAnalyzer::from(SimpleTokenizer) let tokenizer = TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40)) .filter(RemoveLongFilter::limit(40))
.filter(LowerCaser) .filter(LowerCaser)
@@ -156,37 +256,28 @@ pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
let mut wtr = index.writer(1024 * 1024 * 256) let mut wtr = index.writer(1024 * 1024 * 256)
.map_err(|e| { Error::from(format!("creating tantivy index writer failed: {}", e)) })?; .map_err(|e| { Error::from(format!("creating tantivy index writer failed: {}", e)) })?;

let mut seen: HashSet<String> = Default::default();
let mut n_indexed = 0;
// now, let's index!

for section in library.sections_values() {
for page in index_pages {
let mut document: Document = doc!(
fields["title"] => page.title,
fields["description"] => page.description,
fields["permalink"] => page.permalink,
fields["body"] => page.body,

// reason for macro: Section/Page are different types but have same attributes
macro_rules! index_page {
($page:ident) => {{
let already_indexed = seen.contains(&$page.permalink);
if ! already_indexed && $page.meta.in_search_index && $page.lang == lang {
seen.insert($page.permalink.clone()); // mark as indexed
let cleaned_body: String = AMMONIA.clean(&$page.content).to_string();
let page_doc: Document = doc!(
title => $page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
body => cleaned_body.as_str(),
permalink => $page.permalink.as_str(),
n_indexed += 1;
if let Some(utc) = page.datetime {
document.add_date(fields["datetime"], &utc);
} }

if section.meta.redirect_to.is_none() {
for (taxonomy, terms) in page.taxonomies.iter().filter(|(k, _)| ! reserved_field_names.contains(k.as_str())) {
let normalized_taxonomy = normalize_taxonomy_name(taxonomy);
for term in terms.iter() {
document.add_text(fields[&normalized_taxonomy], term);
} }

for key in &section.pages {
let page = library.get_page_by_key(*key);
} }

//wtr.prepare_commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?; //wtr.prepare_commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;
