Browse Source

adds (tantivy) indexing of taxonomies, datetime, description

index-subcmd
Jonathan Strong 1 year ago
parent
commit
ad36ac87b2
2 changed files with 124 additions and 31 deletions
  1. +3
    -1
      components/search/Cargo.toml
  2. +121
    -30
      components/search/src/lib.rs

+ 3
- 1
components/search/Cargo.toml View File

@@ -14,10 +14,12 @@ serde = { version = "1.0", optional = true }
serde_derive = { version = "1.0", optional = true }
serde_json = { version = "1.0", optional = true }
num_cpus = { version = "1.12", optional = true }
chrono = { version = "0.4", optional = true }


errors = { path = "../errors" }
library = { path = "../library" }

[features]
default = []
tantivy-indexing = ["isolang", "serde", "serde_derive", "serde_json", "num_cpus", "tantivy"]
tantivy-indexing = ["isolang", "serde", "serde_derive", "serde_json", "num_cpus", "tantivy", "chrono"]

+ 121
- 30
components/search/src/lib.rs View File

@@ -1,5 +1,6 @@
use std::str::FromStr;
use std::collections::{HashMap, HashSet};
use chrono::{DateTime, Utc, NaiveDateTime, TimeZone};
use elasticlunr::{Index, Language};
use lazy_static::lazy_static;

@@ -108,6 +109,29 @@ fn parse_language(lang: &str) -> Option<tantivy::tokenizer::Language> {
}
}

#[cfg(feature = "tantivy-indexing")]
/// Best-effort conversion of a page's date information into a `DateTime<Utc>`.
///
/// `datetime_string` is tried first, since the raw string may carry its own
/// timezone/offset. If it is absent or fails to parse, `naive_datetime` is used
/// and interpreted as if it were already UTC (an assumption that is acceptable
/// here only because the stakes are low). Returns `None` when neither input
/// yields a value.
fn parse_dt_assume_utc(datetime_string: &Option<String>, naive_datetime: &Option<NaiveDateTime>) -> Option<DateTime<Utc>> {
    // Preferred source: the original string, which can include timezone info.
    let from_string = datetime_string
        .as_ref()
        .and_then(|raw| raw.parse::<DateTime<Utc>>().ok());

    // Fallback: treat the timezone-less value as UTC.
    from_string.or_else(|| {
        naive_datetime
            .as_ref()
            .map(|naive| Utc.from_utc_datetime(naive))
    })
}

#[cfg(feature = "tantivy-indexing")]
/// Returns a copy of `s` in which every `-` has been replaced by `_`; all
/// other characters are passed through unchanged.
fn normalize_taxonomy_name(s: &str) -> String {
    s.chars()
        .map(|ch| if ch == '-' { '_' } else { ch })
        .collect()
}

#[cfg(feature = "tantivy-indexing")]
pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
lang: &str,
@@ -134,18 +158,94 @@ pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
.set_indexing_options(text_indexing_options)
.set_stored();

// Everything needed to build one search document. Entries are collected into
// `index_pages` first so that every taxonomy name in use is known before the
// schema is built; the documents themselves are written afterwards.
struct IndexContent<'a> {
// title from the front matter, or "" when none is set
pub title: &'a str,
// description from the front matter, or "" when none is set
pub description: &'a str,
// permalink of the section/page; also the key used to skip duplicates
pub permalink: &'a str,
// content after being passed through `AMMONIA.clean`
pub body: String,

// only filled in for pages (via `parse_dt_assume_utc`); `None` for sections
pub datetime: Option<DateTime<Utc>>,
// taxonomy name -> terms; sections borrow a shared empty map instead
pub taxonomies: &'a HashMap<String, Vec<String>>,
}

let mut seen: HashSet<String> = Default::default(); // unique permalinks already indexed
let mut all_taxonomies: HashSet<String> = Default::default(); // remember any taxonomy used anywhere so we can add to schema
let mut index_pages: Vec<IndexContent> = Vec::new();
let mut n_indexed = 0;

let empty_taxonomies: HashMap<String, Vec<String>> = Default::default();

for section in library.sections_values() {

// reason for macro: Section/Page are different types but have same attributes
//
// Expands to an `Option<IndexContent>`:
//   * `Some(..)` when `$page`'s permalink is not yet in `seen`, the item is
//     flagged `in_search_index`, and its `lang` equals the requested `lang`;
//   * `None` otherwise.
// On `Some`, the permalink is recorded in `seen` and `n_indexed` is bumped, so
// the same permalink is never emitted twice.
macro_rules! extract_content {
($page:ident) => {{
let already_indexed = seen.contains(&$page.permalink);
if ! already_indexed && $page.meta.in_search_index && $page.lang == lang {
seen.insert($page.permalink.clone()); // mark as indexed
n_indexed += 1;

// run the content through AMMONIA before indexing
// NOTE(review): presumably this strips/sanitizes HTML markup -- confirm against the AMMONIA definition
let cleaned_body: String = AMMONIA.clean(&$page.content).to_string();

Some(IndexContent {
// a missing title/description is represented as the empty string
title: $page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
description: $page.meta.description.as_ref().map(|x| x.as_str()).unwrap_or(""),
permalink: $page.permalink.as_str(),
body: cleaned_body,

// page-only fields, left blank here; the page loop overwrites them after expansion
datetime: None,
taxonomies: &empty_taxonomies,
})
} else {
None
}
}}
}

if section.meta.redirect_to.is_none() {
if let Some(content) = extract_content!(section) {
index_pages.push(content);
}
}

for key in &section.pages {
let page = library.get_page_by_key(*key);
match extract_content!(page) {
Some(mut index_content) => {
all_taxonomies.extend(page.meta.taxonomies.keys().map(|x| normalize_taxonomy_name(x)));
index_content.taxonomies = &page.meta.taxonomies;
index_content.datetime = parse_dt_assume_utc(&page.meta.date, &page.meta.datetime);
index_pages.push(index_content);
}
None => {}
}
}
}

let mut schema = SchemaBuilder::new();

let title = schema.add_text_field("title", text_options.clone());
let body = schema.add_text_field("body", text_options.clone());
let permalink = schema.add_text_field("permalink", STORED);
let mut fields: HashMap<String, Field> = Default::default();

for text_field_name in &["title", "body", "description"] {
fields.insert(text_field_name.to_string(), schema.add_text_field(text_field_name, text_options.clone()));
}
fields.insert("permalink".to_string(), schema.add_text_field("permalink", STORED));
fields.insert("datetime".to_string(), schema.add_date_field("datetime", STORED | INDEXED));

let reserved_field_names: HashSet<String> = fields.keys().map(|s| s.to_string()).collect();

for taxonomy_name in all_taxonomies.difference(&reserved_field_names) {
fields.insert(taxonomy_name.to_string(), schema.add_text_field(taxonomy_name.as_str(), text_options.clone()));
}

let schema = schema.build();

let index = Index::create_in_dir(&index_dir, schema.clone())
.map_err(|e| { Error::from(format!("creating tantivy index failed: {}", e)) })?;

if index.tokenizers().get(&tokenizer_name).is_none() { // if non-english, we need to register stemmer
// take care of non-English stemmers if needed
if index.tokenizers().get(&tokenizer_name).is_none() {
let tokenizer = TextAnalyzer::from(SimpleTokenizer)
.filter(RemoveLongFilter::limit(40))
.filter(LowerCaser)
@@ -156,37 +256,28 @@ pub fn build_tantivy_index<P: AsRef<std::path::Path>>(
let mut wtr = index.writer(1024 * 1024 * 256)
.map_err(|e| { Error::from(format!("creating tantivy index writer failed: {}", e)) })?;

let mut seen: HashSet<String> = Default::default();
let mut n_indexed = 0;
// now, let's index!

for section in library.sections_values() {
for page in index_pages {
let mut document: Document = doc!(
fields["title"] => page.title,
fields["description"] => page.description,
fields["permalink"] => page.permalink,
fields["body"] => page.body,
);

// reason for macro: Section/Page are different types but have same attributes
macro_rules! index_page {
($page:ident) => {{
let already_indexed = seen.contains(&$page.permalink);
if ! already_indexed && $page.meta.in_search_index && $page.lang == lang {
seen.insert($page.permalink.clone()); // mark ask indexed
let cleaned_body: String = AMMONIA.clean(&$page.content).to_string();
let page_doc: Document = doc!(
title => $page.meta.title.as_ref().map(|x| x.as_str()).unwrap_or(""),
body => cleaned_body.as_str(),
permalink => $page.permalink.as_str(),
);
wtr.add_document(page_doc);
n_indexed += 1;
}
}}
if let Some(utc) = page.datetime {
document.add_date(fields["datetime"], &utc);
}

if section.meta.redirect_to.is_none() {
index_page!(section);
for (taxonomy, terms) in page.taxonomies.iter().filter(|(k, _)| ! reserved_field_names.contains(k.as_str())) {
let normalized_taxonomy = normalize_taxonomy_name(taxonomy);
for term in terms.iter() {
document.add_text(fields[&normalized_taxonomy], term);
}
}

for key in &section.pages {
let page = library.get_page_by_key(*key);
index_page!(page);
}
wtr.add_document(document);
}

//wtr.prepare_commit().map_err(|e| { Error::from(format!("tantivy IndexWriter::commit failed: {}", e)) })?;


Loading…
Cancel
Save